You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/03 18:52:31 UTC
[2/2] tika git commit: TIKA-1994 -- Integrate TesseractOCR with full
page image rendering for PDFs
TIKA-1994 -- Integrate TesseractOCR with full page image rendering for PDFs
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/ebe70289
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/ebe70289
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/ebe70289
Branch: refs/heads/2.x
Commit: ebe70289815776f6ce6c271c7faf8d23cfd31337
Parents: e5a7604
Author: tballison <ta...@mitre.org>
Authored: Fri Jun 3 14:52:19 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Fri Jun 3 14:52:19 2016 -0400
----------------------------------------------------------------------
CHANGES.txt | 2 +
.../apache/tika/module/journal/BundleIT.java | 2 +-
.../tika-parser-pdf-bundle/pom.xml | 21 +-
.../org/apache/tika/module/pdf/BundleIT.java | 2 +-
.../tika-parser-multimedia-module/pom.xml | 6 -
.../tika/parser/ocr/TesseractOCRParser.java | 93 ++-
.../tika/parser/ocr/TesseractOCRParserTest.java | 527 +++++++++--------
.../tika-parser-pdf-module/pom.xml | 5 +
.../tika/parser/pdf/AbstractPDF2XHTML.java | 575 +++++++++++++++++++
.../org/apache/tika/parser/pdf/OCR2XHTML.java | 125 ++++
.../org/apache/tika/parser/pdf/PDF2XHTML.java | 498 +---------------
.../org/apache/tika/parser/pdf/PDFParser.java | 8 +
.../apache/tika/parser/pdf/PDFParserConfig.java | 274 ++++++---
.../apache/tika/parser/pdf/PDFParser.properties | 10 +-
.../apache/tika/parser/pdf/PDFParserTest.java | 37 ++
15 files changed, 1322 insertions(+), 863 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index f359484..fbc2236 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -10,6 +10,8 @@ Release 2.0 - Future Development
Release 1.14 - ???
+ * Integrate TesseractOCR with full page image rendering for PDFs (TIKA-1994).
+
* Add mime detection via Nick C and parser for DBF files (TIKA-1513).
* Add mime detection and parsers for MSOffice 2003 XML Word
http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/tika-parser-bundles/tika-parser-journal-bundle/src/test/java/org/apache/tika/module/journal/BundleIT.java
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-journal-bundle/src/test/java/org/apache/tika/module/journal/BundleIT.java b/tika-parser-bundles/tika-parser-journal-bundle/src/test/java/org/apache/tika/module/journal/BundleIT.java
index 6d65164..c8e8448 100644
--- a/tika-parser-bundles/tika-parser-journal-bundle/src/test/java/org/apache/tika/module/journal/BundleIT.java
+++ b/tika-parser-bundles/tika-parser-journal-bundle/src/test/java/org/apache/tika/module/journal/BundleIT.java
@@ -92,6 +92,6 @@ public class BundleIT {
@Test
public void testServicesCreated() throws Exception {
ServiceReference[] services = bc.getAllServiceReferences(Parser.class.getName(), null);
- assertEquals("Not all Services have started", 2, services.length);
+ assertEquals("Not all Services have started", 16, services.length);
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml b/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml
index dbd65e1..25eef2e 100644
--- a/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml
+++ b/tika-parser-bundles/tika-parser-pdf-bundle/pom.xml
@@ -47,6 +47,7 @@
<Bundle-Activator>org.apache.tika.module.pdf.internal.Activator</Bundle-Activator>
<Embed-Dependency>
tika-parser-pdf-module;inline=true,
+ tika-parser-multimedia-module;inline=true,
tika-parser-xmp-commons;inline=true,
commons-io;inline=true,
pdfbox;inline=true,
@@ -65,6 +66,22 @@
<Import-Package>
*,
com.ibm.icu.text;resolution:=optional,
+ com.coremedia.iso;resolution:=optional,
+ com.coremedia.iso.boxes;resolution:=optional,
+ com.coremedia.iso.boxes.apple;resolution:=optional,
+ com.coremedia.iso.boxes.sampleentry;resolution:=optional,
+ com.drew.imaging.jpeg;resolution:=optional,
+ com.drew.imaging.riff;resolution:=optional,
+ com.drew.imaging.tiff;resolution:=optional,
+ com.drew.imaging.webp;resolution:=optional,
+ com.drew.lang;resolution:=optional,
+ com.drew.metadata;resolution:=optional,
+ com.drew.metadata.exif;resolution:=optional,
+ com.drew.metadata.iptc;resolution:=optional,
+ com.drew.metadata.jpeg;resolution:=optional,
+ com.googlecode.mp4parser;resolution:=optional,
+ com.googlecode.mp4parser.boxes.apple;resolution:=optional,
+ com.googlecode.mp4parser.util;resolution:=optional,
javax.mail;resolution:=optional,
javax.mail.internet;resolution:=optional,
org.bouncycastle.cert;resolution:=optional,
@@ -73,7 +90,9 @@
org.bouncycastle.cms.bc;resolution:=optional,
org.bouncycastle.operator;resolution:=optional,
org.bouncycastle.operator.bc;resolution:=optional,
- org.bouncycastle.tsp;resolution:=optional
+ org.bouncycastle.tsp;resolution:=optional,
+ org.apache.commons.exec;resolution:=optional,
+ org.apache.commons.exec.environment;resolution:=optional
</Import-Package>
</instructions>
</configuration>
http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/tika-parser-bundles/tika-parser-pdf-bundle/src/test/java/org/apache/tika/module/pdf/BundleIT.java
----------------------------------------------------------------------
diff --git a/tika-parser-bundles/tika-parser-pdf-bundle/src/test/java/org/apache/tika/module/pdf/BundleIT.java b/tika-parser-bundles/tika-parser-pdf-bundle/src/test/java/org/apache/tika/module/pdf/BundleIT.java
index bbc72bb..8e1d010 100644
--- a/tika-parser-bundles/tika-parser-pdf-bundle/src/test/java/org/apache/tika/module/pdf/BundleIT.java
+++ b/tika-parser-bundles/tika-parser-pdf-bundle/src/test/java/org/apache/tika/module/pdf/BundleIT.java
@@ -91,6 +91,6 @@ public class BundleIT {
@Test
public void testServicesCreated() throws Exception {
ServiceReference[] services = bc.getAllServiceReferences(Parser.class.getName(), null);
- assertEquals("Not all Services have started", 1, services.length);
+ assertEquals("Not all Services have started", 15, services.length);
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/tika-parser-modules/tika-parser-multimedia-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/pom.xml b/tika-parser-modules/tika-parser-multimedia-module/pom.xml
index 0192b8b..7a3a704 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/pom.xml
+++ b/tika-parser-modules/tika-parser-multimedia-module/pom.xml
@@ -83,12 +83,6 @@
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-pdf-module</artifactId>
- <version>${project.version}</version>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
<artifactId>tika-parser-office-module</artifactId>
<version>${project.version}</version>
<scope>test</scope>
http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index 7db29c8..83fe7fe 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -16,9 +16,7 @@
*/
package org.apache.tika.parser.ocr;
-import javax.imageio.ImageIO;
-
-import java.awt.Image;
+import java.awt.*;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileInputStream;
@@ -40,6 +38,7 @@ import java.util.concurrent.FutureTask;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
+import javax.imageio.ImageIO;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.LogFactory;
import org.apache.tika.exception.TikaException;
@@ -56,10 +55,10 @@ import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.parser.image.ImageParser;
import org.apache.tika.parser.image.TiffParser;
import org.apache.tika.parser.jpeg.JpegParser;
+import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-
import static java.nio.charset.StandardCharsets.UTF_8;
/**
@@ -110,7 +109,7 @@ public class TesseractOCRParser extends AbstractParser {
}
}
- private boolean hasTesseract(TesseractOCRConfig config) {
+ public boolean hasTesseract(TesseractOCRConfig config) {
// Fetch where the config says to find Tesseract
String tesseract = config.getTesseractPath() + getTesseractProg();
@@ -157,47 +156,90 @@ public class TesseractOCRParser extends AbstractParser {
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
TesseractOCRConfig config = context.get(TesseractOCRConfig.class, DEFAULT_CONFIG);
-
// If Tesseract is not on the path with the current config, do not try to run OCR
// getSupportedTypes shouldn't have listed us as handling it, so this should only
// occur if someone directly calls this parser, not via DefaultParser or similar
if (! hasTesseract(config))
return;
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ TemporaryResources tmp = new TemporaryResources();
+ try {
+ TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ File tmpImgFile = tmp.createTemporaryFile();
+ parse(tikaStream, tmpImgFile, xhtml, config);
+ // Temporary workaround for TIKA-1445 - until we can specify
+ // composite parsers with strategies (eg Composite, Try In Turn),
+ // always send the image onwards to the regular parser to have
+ // the metadata for them extracted as well
+ _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, new EmbeddedContentHandler(xhtml), metadata, context);
+ xhtml.endDocument();
+ } finally {
+ tmp.dispose();
+ }
+ }
+
+ /**
+ * Use this to parse content without starting a new document.
+ * This appends SAX events to xhtml without re-adding the metadata, body start, etc.
+ * @param stream inputstream
+ * @param xhtml handler
+ * @param config TesseractOCRConfig to use for this parse
+ * @throws IOException
+ * @throws SAXException
+ * @throws TikaException
+ */
+ public void parseInline(InputStream stream, XHTMLContentHandler xhtml, TesseractOCRConfig config)
+ throws IOException, SAXException, TikaException {
+ // If Tesseract is not on the path with the current config, do not try to run OCR
+ // getSupportedTypes shouldn't have listed us as handling it, so this should only
+ // occur if someone directly calls this parser, not via DefaultParser or similar
+ if (! hasTesseract(config))
+ return;
TemporaryResources tmp = new TemporaryResources();
- File output = null;
try {
TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
- File input = tikaStream.getFile();
- long size = tikaStream.getLength();
+ File tmpImgFile = tmp.createTemporaryFile();
+ parse(tikaStream, tmpImgFile, xhtml, config);
+ } finally {
+ tmp.dispose();
+ }
+
+ }
+
+ private void parse(TikaInputStream tikaInputStream, File tmpImgFile, XHTMLContentHandler xhtml, TesseractOCRConfig config)
+ throws IOException, SAXException, TikaException {
+ File tmpTxtOutput = null;
+
+ try {
+ File input = tikaInputStream.getFile();
+ long size = tikaInputStream.getLength();
if (size >= config.getMinFileSizeToOcr() && size <= config.getMaxFileSizeToOcr()) {
- output = tmp.createTemporaryFile();
- doOCR(input, output, config);
+ doOCR(input, tmpImgFile, config);
// Tesseract appends .txt to output file name
- output = new File(output.getAbsolutePath() + ".txt");
+ tmpTxtOutput = new File(tmpImgFile.getAbsolutePath() + ".txt");
- if (output.exists())
- extractOutput(new FileInputStream(output), xhtml);
+ if (tmpTxtOutput.exists()) {
+ try (InputStream is = new FileInputStream(tmpTxtOutput)) {
+ extractOutput(is, xhtml);
+ }
+ }
}
- // Temporary workaround for TIKA-1445 - until we can specify
- // composite parsers with strategies (eg Composite, Try In Turn),
- // always send the image onwards to the regular parser to have
- // the metadata for them extracted as well
- _TMP_IMAGE_METADATA_PARSER.parse(tikaStream, handler, metadata, context);
} finally {
- tmp.dispose();
- if (output != null) {
- output.delete();
+ if (tmpTxtOutput != null) {
+ tmpTxtOutput.delete();
}
}
}
+
// TIKA-1445 workaround parser
private static Parser _TMP_IMAGE_METADATA_PARSER = new CompositeImageParser();
private static class CompositeImageParser extends CompositeParser {
@@ -283,8 +325,7 @@ public class TesseractOCRParser extends AbstractParser {
*/
private void extractOutput(InputStream stream, XHTMLContentHandler xhtml) throws SAXException, IOException {
- xhtml.startDocument();
- xhtml.startElement("div");
+ xhtml.startElement("div", "class", "ocr");
try (Reader reader = new InputStreamReader(stream, UTF_8)) {
char[] buffer = new char[1024];
for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
@@ -293,7 +334,7 @@ public class TesseractOCRParser extends AbstractParser {
}
}
xhtml.endElement("div");
- xhtml.endDocument();
+
}
/**
http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index e99f6ae..9ab958e 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -1,265 +1,262 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.ocr;
-
-import static org.apache.tika.parser.ocr.TesseractOCRParser.getTesseractProg;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
-import static org.junit.Assume.assumeTrue;
-import static org.mockito.Matchers.any;
-import static org.mockito.Matchers.eq;
-import static org.mockito.Mockito.mock;
-import static org.mockito.Mockito.times;
-import static org.mockito.Mockito.verify;
-
-import java.io.InputStream;
-import java.util.List;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.DefaultParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.RecursiveParserWrapper;
-import org.apache.tika.parser.external.ExternalParser;
-import org.apache.tika.parser.image.ImageParser;
-import org.apache.tika.parser.mail.RFC822Parser;
-import org.apache.tika.parser.pdf.PDFParserConfig;
-import org.apache.tika.sax.BasicContentHandlerFactory;
-import org.apache.tika.sax.BodyContentHandler;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.junit.Test;
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.helpers.DefaultHandler;
-
-public class TesseractOCRParserTest extends TikaTest {
-
- public static boolean canRun() {
- TesseractOCRConfig config = new TesseractOCRConfig();
- TesseractOCRParserTest tesseractOCRTest = new TesseractOCRParserTest();
- return tesseractOCRTest.canRun(config);
- }
-
- private boolean canRun(TesseractOCRConfig config) {
- String[] checkCmd = {config.getTesseractPath() + getTesseractProg()};
- // If Tesseract is not on the path, do not run the test.
- return ExternalParser.check(checkCmd);
- }
-
- /*
- Check that if Tesseract is not found, the TesseractOCRParser claims to not support
- any file types. So, the standard image parser is called instead.
- */
- @Test
- public void offersNoTypesIfNotFound() throws Exception {
- TesseractOCRParser parser = new TesseractOCRParser();
- DefaultParser defaultParser = new DefaultParser();
- MediaType png = MediaType.image("png");
-
- // With an invalid path, will offer no types
- TesseractOCRConfig invalidConfig = new TesseractOCRConfig();
- invalidConfig.setTesseractPath("/made/up/path");
-
- ParseContext parseContext = new ParseContext();
- parseContext.set(TesseractOCRConfig.class, invalidConfig);
-
- // No types offered
- assertEquals(0, parser.getSupportedTypes(parseContext).size());
-
- // And DefaultParser won't use us
- assertEquals(ImageParser.class, defaultParser.getParsers(parseContext).get(png).getClass());
- }
-
- /*
- If Tesseract is found, test we retrieve the proper number of supporting Parsers.
- */
- @Test
- public void offersTypesIfFound() throws Exception {
- TesseractOCRParser parser = new TesseractOCRParser();
- DefaultParser defaultParser = new DefaultParser();
-
- ParseContext parseContext = new ParseContext();
- MediaType png = MediaType.image("png");
-
- // Assuming that Tesseract is on the path, we should find 5 Parsers that support PNG.
- assumeTrue(canRun());
-
- assertEquals(5, parser.getSupportedTypes(parseContext).size());
- assertTrue(parser.getSupportedTypes(parseContext).contains(png));
-
- // DefaultParser will now select the TesseractOCRParser.
- assertEquals(TesseractOCRParser.class, defaultParser.getParsers(parseContext).get(png).getClass());
- }
-
- @Test
- public void testPDFOCR() throws Exception {
- String resource = "/test-documents/testOCR.pdf";
- String[] nonOCRContains = new String[0];
- testBasicOCR(resource, nonOCRContains, 2);
- }
-
- @Test
- public void testDOCXOCR() throws Exception {
- String resource = "/test-documents/testOCR.docx";
- String[] nonOCRContains = {
- "This is some text.",
- "Here is an embedded image:"
- };
- testBasicOCR(resource, nonOCRContains, 3);
- }
-
- @Test
- public void testPPTXOCR() throws Exception {
- String resource = "/test-documents/testOCR.pptx";
- String[] nonOCRContains = {
- "This is some text"
- };
- testBasicOCR(resource, nonOCRContains, 3);
- }
-
- private void testBasicOCR(String resource, String[] nonOCRContains, int numMetadatas) throws Exception {
- TesseractOCRConfig config = new TesseractOCRConfig();
- Parser parser = new RecursiveParserWrapper(new AutoDetectParser(),
- new BasicContentHandlerFactory(
- BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
-
- PDFParserConfig pdfConfig = new PDFParserConfig();
- pdfConfig.setExtractInlineImages(true);
-
- ParseContext parseContext = new ParseContext();
- parseContext.set(TesseractOCRConfig.class, config);
- parseContext.set(Parser.class, parser);
- parseContext.set(PDFParserConfig.class, pdfConfig);
-
- try (InputStream stream = TesseractOCRParserTest.class.getResourceAsStream(resource)) {
- parser.parse(stream, new DefaultHandler(), new Metadata(), parseContext);
- }
- List<Metadata> metadataList = ((RecursiveParserWrapper) parser).getMetadata();
- assertEquals(numMetadatas, metadataList.size());
-
- StringBuilder contents = new StringBuilder();
- for (Metadata m : metadataList) {
- contents.append(m.get(RecursiveParserWrapper.TIKA_CONTENT));
- }
- if (canRun()) {
- assertTrue(contents.toString().contains("Happy New Year 2003!"));
- }
- for (String needle : nonOCRContains) {
- assertContains(needle, contents.toString());
- }
- assertTrue(metadataList.get(0).names().length > 10);
- assertTrue(metadataList.get(1).names().length > 10);
- //test at least one value
- assertEquals("deflate", metadataList.get(1).get("Compression CompressionTypeName"));
- }
-
- @Test
- public void testSingleImage() throws Exception {
- assumeTrue(canRun());
- String xml = getXML("testOCR.jpg").xml;
- assertContains("OCR Testing", xml);
- }
-
- @Test
- public void getNormalMetadataToo() throws Exception {
- //this should be successful whether or not TesseractOCR is installed/active
- //If tesseract is installed, the internal metadata extraction parser should
- //work; and if tesseract isn't installed, the regular parsers should take over.
-
- //gif
- Metadata m = getXML("testGIF.gif").metadata;
- assertTrue(m.names().length > 20);
- assertEquals("RGB", m.get("Chroma ColorSpaceType"));
-
- //jpg
- m = getXML("testOCR.jpg").metadata;
- assertEquals("136", m.get(Metadata.IMAGE_WIDTH));
- assertEquals("66", m.get(Metadata.IMAGE_LENGTH));
- assertEquals("8", m.get(Metadata.BITS_PER_SAMPLE));
- assertEquals(null, m.get(Metadata.SAMPLES_PER_PIXEL));
- assertContains("This is a test Apache Tika imag", m.get(Metadata.COMMENTS));
-
- //bmp
- m = getXML("testBMP.bmp").metadata;
- assertEquals("100", m.get(Metadata.IMAGE_WIDTH));
- assertEquals("75", m.get(Metadata.IMAGE_LENGTH));
-
- //png
- m = getXML("testPNG.png").metadata;
- assertEquals("100", m.get(Metadata.IMAGE_WIDTH));
- assertEquals("75", m.get(Metadata.IMAGE_LENGTH));
- assertEquals("UnsignedIntegral", m.get("Data SampleFormat"));
-
- //tiff
- m = getXML("testTIFF.tif").metadata;
- assertEquals("100", m.get(Metadata.IMAGE_WIDTH));
- assertEquals("75", m.get(Metadata.IMAGE_LENGTH));
- assertEquals("72 dots per inch", m.get("Y Resolution"));
- }
-
- @Test
- public void testMultipart() {
- Parser parser = new RFC822Parser();
- Metadata metadata = new Metadata();
- InputStream stream = getStream("test-documents/testRFC822-multipart");
- ContentHandler handler = mock(XHTMLContentHandler.class);
-
- try {
- parser.parse(stream, handler, metadata, new ParseContext());
- verify(handler).startDocument();
- int bodyExpectedTimes = 4, multipackExpectedTimes = 5;
- // TIKA-1422. TesseractOCRParser interferes with the number of times the handler is invoked.
- // But, different versions of Tesseract lead to a different number of invocations. So, we
- // only verify the handler if Tesseract cannot run.
- if (!TesseractOCRParserTest.canRun()) {
- verify(handler, times(bodyExpectedTimes)).startElement(eq(XHTMLContentHandler.XHTML), eq("div"), eq("div"), any(Attributes.class));
- verify(handler, times(bodyExpectedTimes)).endElement(XHTMLContentHandler.XHTML, "div", "div");
- }
- } catch (Exception e) {
- fail("Exception thrown: " + e.getMessage());
- }
-
- //repeat, this time looking at content
- parser = new RFC822Parser();
- metadata = new Metadata();
- stream = getStream("test-documents/testRFC822-multipart");
- handler = new BodyContentHandler();
- try {
- parser.parse(stream, handler, metadata, new ParseContext());
- //tests correct decoding of quoted printable text, including UTF-8 bytes into Unicode
- String bodyText = handler.toString();
- assertTrue(bodyText.contains("body 1"));
- assertTrue(bodyText.contains("body 2"));
- assertFalse(bodyText.contains("R0lGODlhNgE8AMQAA")); //part of encoded gif
- } catch (Exception e) {
- fail("Exception thrown: " + e.getMessage());
- }
- }
-
- private static InputStream getStream(String name) {
- InputStream stream = Thread.currentThread().getContextClassLoader()
- .getResourceAsStream(name);
- assertNotNull("Test file not found " + name, stream);
- return stream;
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.ocr;
+
+import static org.apache.tika.parser.ocr.TesseractOCRParser.getTesseractProg;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+import static org.junit.Assume.assumeTrue;
+import static org.mockito.Matchers.any;
+import static org.mockito.Matchers.eq;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
+
+import java.io.InputStream;
+import java.util.List;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.DefaultParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.parser.external.ExternalParser;
+import org.apache.tika.parser.image.ImageParser;
+import org.apache.tika.parser.mail.RFC822Parser;
+import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.junit.Ignore;
+import org.junit.Test;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.DefaultHandler;
+
+public class TesseractOCRParserTest extends TikaTest {
+
+ public static boolean canRun() {
+ TesseractOCRConfig config = new TesseractOCRConfig();
+ TesseractOCRParserTest tesseractOCRTest = new TesseractOCRParserTest();
+ return tesseractOCRTest.canRun(config);
+ }
+
+ private boolean canRun(TesseractOCRConfig config) {
+ String[] checkCmd = {config.getTesseractPath() + getTesseractProg()};
+ // If Tesseract is not on the path, do not run the test.
+ return ExternalParser.check(checkCmd);
+ }
+
+ /*
+ Check that if Tesseract is not found, the TesseractOCRParser claims to not support
+ any file types. So, the standard image parser is called instead.
+ */
+ @Test
+ public void offersNoTypesIfNotFound() throws Exception {
+ TesseractOCRParser parser = new TesseractOCRParser();
+ DefaultParser defaultParser = new DefaultParser();
+ MediaType png = MediaType.image("png");
+
+ // With an invalid path, will offer no types
+ TesseractOCRConfig invalidConfig = new TesseractOCRConfig();
+ invalidConfig.setTesseractPath("/made/up/path");
+
+ ParseContext parseContext = new ParseContext();
+ parseContext.set(TesseractOCRConfig.class, invalidConfig);
+
+ // No types offered
+ assertEquals(0, parser.getSupportedTypes(parseContext).size());
+
+ // And DefaultParser won't use us
+ assertEquals(ImageParser.class, defaultParser.getParsers(parseContext).get(png).getClass());
+ }
+
+ /*
+ If Tesseract is found, test we retrieve the proper number of supporting Parsers.
+ */
+ @Test
+ public void offersTypesIfFound() throws Exception {
+ TesseractOCRParser parser = new TesseractOCRParser();
+ DefaultParser defaultParser = new DefaultParser();
+
+ ParseContext parseContext = new ParseContext();
+ MediaType png = MediaType.image("png");
+
+ // Assuming that Tesseract is on the path, we should find 5 Parsers that support PNG.
+ assumeTrue(canRun());
+
+ assertEquals(5, parser.getSupportedTypes(parseContext).size());
+ assertTrue(parser.getSupportedTypes(parseContext).contains(png));
+
+ // DefaultParser will now select the TesseractOCRParser.
+ assertEquals(TesseractOCRParser.class, defaultParser.getParsers(parseContext).get(png).getClass());
+ }
+
+ @Test
+ @Ignore("TODO: cyclic reference to pdf-module...maybe move these all to tika-app?")
+ public void testPDFOCR() throws Exception {
+ String resource = "/test-documents/testOCR.pdf";
+ String[] nonOCRContains = new String[0];
+ testBasicOCR(resource, nonOCRContains, 2);
+ }
+
+ @Test
+ public void testDOCXOCR() throws Exception {
+ String resource = "/test-documents/testOCR.docx";
+ String[] nonOCRContains = {
+ "This is some text.",
+ "Here is an embedded image:"
+ };
+ testBasicOCR(resource, nonOCRContains, 3);
+ }
+
+ @Test
+ public void testPPTXOCR() throws Exception {
+ String resource = "/test-documents/testOCR.pptx";
+ String[] nonOCRContains = {
+ "This is some text"
+ };
+ testBasicOCR(resource, nonOCRContains, 3);
+ }
+
+ private void testBasicOCR(String resource, String[] nonOCRContains, int numMetadatas) throws Exception {
+ TesseractOCRConfig config = new TesseractOCRConfig();
+ Parser parser = new RecursiveParserWrapper(new AutoDetectParser(),
+ new BasicContentHandlerFactory(
+ BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
+
+ ParseContext parseContext = new ParseContext();
+ parseContext.set(TesseractOCRConfig.class, config);
+ parseContext.set(Parser.class, parser);
+
+ try (InputStream stream = TesseractOCRParserTest.class.getResourceAsStream(resource)) {
+ parser.parse(stream, new DefaultHandler(), new Metadata(), parseContext);
+ }
+ List<Metadata> metadataList = ((RecursiveParserWrapper) parser).getMetadata();
+ assertEquals(numMetadatas, metadataList.size());
+
+ StringBuilder contents = new StringBuilder();
+ for (Metadata m : metadataList) {
+ contents.append(m.get(RecursiveParserWrapper.TIKA_CONTENT));
+ }
+ if (canRun()) {
+ assertTrue(contents.toString().contains("Happy New Year 2003!"));
+ }
+ for (String needle : nonOCRContains) {
+ assertContains(needle, contents.toString());
+ }
+ assertTrue(metadataList.get(0).names().length > 10);
+ assertTrue(metadataList.get(1).names().length > 10);
+ //test at least one value
+ assertEquals("deflate", metadataList.get(1).get("Compression CompressionTypeName"));
+ }
+
+ @Test
+ public void testSingleImage() throws Exception {
+ assumeTrue(canRun());
+ String xml = getXML("testOCR.jpg").xml;
+ assertContains("OCR Testing", xml);
+ }
+
+ @Test
+ public void getNormalMetadataToo() throws Exception {
+ //this should be successful whether or not TesseractOCR is installed/active
+ //If tesseract is installed, the internal metadata extraction parser should
+ //work; and if tesseract isn't installed, the regular parsers should take over.
+
+ //gif
+ Metadata m = getXML("testGIF.gif").metadata;
+ assertTrue(m.names().length > 20);
+ assertEquals("RGB", m.get("Chroma ColorSpaceType"));
+
+ //jpg
+ m = getXML("testOCR.jpg").metadata;
+ assertEquals("136", m.get(Metadata.IMAGE_WIDTH));
+ assertEquals("66", m.get(Metadata.IMAGE_LENGTH));
+ assertEquals("8", m.get(Metadata.BITS_PER_SAMPLE));
+ assertEquals(null, m.get(Metadata.SAMPLES_PER_PIXEL));
+ assertContains("This is a test Apache Tika imag", m.get(Metadata.COMMENTS));
+
+ //bmp
+ m = getXML("testBMP.bmp").metadata;
+ assertEquals("100", m.get(Metadata.IMAGE_WIDTH));
+ assertEquals("75", m.get(Metadata.IMAGE_LENGTH));
+
+ //png
+ m = getXML("testPNG.png").metadata;
+ assertEquals("100", m.get(Metadata.IMAGE_WIDTH));
+ assertEquals("75", m.get(Metadata.IMAGE_LENGTH));
+ assertEquals("UnsignedIntegral", m.get("Data SampleFormat"));
+
+ //tiff
+ m = getXML("testTIFF.tif").metadata;
+ assertEquals("100", m.get(Metadata.IMAGE_WIDTH));
+ assertEquals("75", m.get(Metadata.IMAGE_LENGTH));
+ assertEquals("72 dots per inch", m.get("Y Resolution"));
+ }
+
+ @Test
+ public void testMultipart() {
+ Parser parser = new RFC822Parser();
+ Metadata metadata = new Metadata();
+ InputStream stream = getStream("test-documents/testRFC822-multipart");
+ ContentHandler handler = mock(XHTMLContentHandler.class);
+
+ try {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ verify(handler).startDocument();
+ int bodyExpectedTimes = 4, multipackExpectedTimes = 5;
+ // TIKA-1422. TesseractOCRParser interferes with the number of times the handler is invoked.
+ // But, different versions of Tesseract lead to a different number of invocations. So, we
+ // only verify the handler if Tesseract cannot run.
+ if (!TesseractOCRParserTest.canRun()) {
+ verify(handler, times(bodyExpectedTimes)).startElement(eq(XHTMLContentHandler.XHTML), eq("div"), eq("div"), any(Attributes.class));
+ verify(handler, times(bodyExpectedTimes)).endElement(XHTMLContentHandler.XHTML, "div", "div");
+ }
+ } catch (Exception e) {
+ fail("Exception thrown: " + e.getMessage());
+ }
+
+ //repeat, this time looking at content
+ parser = new RFC822Parser();
+ metadata = new Metadata();
+ stream = getStream("test-documents/testRFC822-multipart");
+ handler = new BodyContentHandler();
+ try {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ //tests correct decoding of quoted printable text, including UTF-8 bytes into Unicode
+ String bodyText = handler.toString();
+ assertTrue(bodyText.contains("body 1"));
+ assertTrue(bodyText.contains("body 2"));
+ assertFalse(bodyText.contains("R0lGODlhNgE8AMQAA")); //part of encoded gif
+ } catch (Exception e) {
+ fail("Exception thrown: " + e.getMessage());
+ }
+ }
+
+ private static InputStream getStream(String name) {
+ InputStream stream = Thread.currentThread().getContextClassLoader()
+ .getResourceAsStream(name);
+ assertNotNull("Test file not found " + name, stream);
+ return stream;
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/tika-parser-modules/tika-parser-pdf-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-pdf-module/pom.xml b/tika-parser-modules/tika-parser-pdf-module/pom.xml
index 2156b95..11f259e 100644
--- a/tika-parser-modules/tika-parser-pdf-module/pom.xml
+++ b/tika-parser-modules/tika-parser-pdf-module/pom.xml
@@ -35,6 +35,11 @@
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-multimedia-module</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
<artifactId>tika-parser-xmp-commons</artifactId>
<version>${project.version}</version>
</dependency>
http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
new file mode 100644
index 0000000..9a73bde
--- /dev/null
+++ b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -0,0 +1,575 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import java.awt.image.BufferedImage;
+import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Calendar;
+import java.util.List;
+import java.util.ListIterator;
+import java.util.Locale;
+import java.util.Map;
+import java.util.TreeMap;
+
+import javax.xml.stream.XMLStreamException;
+import org.apache.commons.io.IOExceptionWithCause;
+import org.apache.commons.io.IOUtils;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
+import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
+import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
+import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
+import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
+import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
+import org.apache.pdfbox.pdmodel.interactive.action.PDActionURI;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup;
+import org.apache.pdfbox.pdmodel.interactive.digitalsignature.PDSignature;
+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineNode;
+import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm;
+import org.apache.pdfbox.pdmodel.interactive.form.PDField;
+import org.apache.pdfbox.pdmodel.interactive.form.PDNonTerminalField;
+import org.apache.pdfbox.pdmodel.interactive.form.PDSignatureField;
+import org.apache.pdfbox.pdmodel.interactive.form.PDXFAResource;
+import org.apache.pdfbox.rendering.PDFRenderer;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.apache.pdfbox.tools.imageio.ImageIOUtil;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.ocr.TesseractOCRConfig;
+import org.apache.tika.parser.ocr.TesseractOCRParser;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.NO_OCR;
+
+class AbstractPDF2XHTML extends PDFTextStripper {
+
+    /**
+     * Upper bound on how deeply nested AcroForm fields are followed.
+     * Guards against a theoretical AcroForm recursion bomb.
+     */
+    private static final int MAX_ACROFORM_RECURSIONS = 10;
+
+    /** Tesseract settings passed as the default when looking one up in the parse context. */
+    private static final TesseractOCRConfig DEFAULT_TESSERACT_CONFIG = new TesseractOCRConfig();
+
+    /**
+     * Formatter applied to signature dates.
+     * TODO Make this thread-safe
+     */
+    private final SimpleDateFormat dateFormat =
+            new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.ROOT);
+
+    /** IOExceptions that were caught and recorded instead of being rethrown. */
+    final List<IOException> exceptions = new ArrayList<>();
+
+    /** The PDF being converted. */
+    final PDDocument pdDocument;
+
+    /** Sink for all XHTML SAX events produced by this stripper. */
+    final XHTMLContentHandler xhtml;
+
+    /** Parser configuration (OCR strategy, annotation/AcroForm switches, ...). */
+    final PDFParserConfig config;
+
+    private final ParseContext context;
+    private final Metadata metadata;
+
+    /** Zero-based index of the page currently being processed; advanced in endPage. */
+    private int pageIndex;
+
+    /**
+     * Builds a stripper that reports the document's content as XHTML SAX
+     * events on the supplied handler.
+     *
+     * @param pdDocument PDF document to process
+     * @param handler    downstream SAX handler, wrapped in an {@link XHTMLContentHandler}
+     * @param context    parse context consulted for extractors and OCR configuration
+     * @param metadata   metadata of the document; also receives exception warnings
+     * @param config     PDF parser configuration
+     * @throws IOException if the underlying {@link PDFTextStripper} cannot be created
+     */
+    AbstractPDF2XHTML(PDDocument pdDocument, ContentHandler handler, ParseContext context, Metadata metadata,
+                      PDFParserConfig config) throws IOException {
+        super();
+        this.config = config;
+        this.context = context;
+        this.metadata = metadata;
+        this.pdDocument = pdDocument;
+        this.xhtml = new XHTMLContentHandler(handler, metadata);
+    }
+
+    /**
+     * Opens the {@code <div class="page">} wrapper for a page and then
+     * starts the first paragraph.
+     *
+     * @param page the page about to be processed
+     * @throws IOException if the SAX handler rejects the element (the
+     *                     SAXException is attached as the cause) or the
+     *                     paragraph start fails
+     */
+    @Override
+    protected void startPage(PDPage page) throws IOException {
+        try {
+            xhtml.startElement("div", "class", "page");
+        } catch (SAXException saxFailure) {
+            throw new IOExceptionWithCause("Unable to start a page", saxFailure);
+        }
+
+        writeParagraphStart();
+    }
+
+    /**
+     * Resolves the extractor used for embedded documents.
+     *
+     * @return the extractor registered in the parse context, or a new
+     *         {@link ParsingEmbeddedDocumentExtractor} when none is registered
+     */
+    EmbeddedDocumentExtractor getEmbeddedDocumentExtractor() {
+        EmbeddedDocumentExtractor fromContext = context.get(EmbeddedDocumentExtractor.class);
+        return (fromContext != null)
+                ? fromContext
+                : new ParsingEmbeddedDocumentExtractor(context);
+    }
+
+    /**
+     * Extracts files attached through the document-level embedded-files
+     * name tree.
+     * <p>
+     * Names are taken from the tree root when present; otherwise only the
+     * root's direct kids are inspected. This follows
+     * pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java; a fully recursive
+     * search could be added if the need arises.
+     *
+     * @param document document whose catalog is searched
+     */
+    private void extractEmbeddedDocuments(PDDocument document)
+            throws IOException, SAXException, TikaException {
+        PDDocumentNameDictionary nameDictionary =
+                new PDDocumentNameDictionary(document.getDocumentCatalog());
+        PDEmbeddedFilesNameTreeNode embeddedTree = nameDictionary.getEmbeddedFiles();
+        if (embeddedTree == null) {
+            return;
+        }
+
+        Map<String, PDComplexFileSpecification> rootNames = embeddedTree.getNames();
+        if (rootNames != null) {
+            processEmbeddedDocNames(rootNames);
+            return;
+        }
+
+        // Nothing on the root node itself: fall back to its immediate children.
+        List<PDNameTreeNode<PDComplexFileSpecification>> children = embeddedTree.getKids();
+        if (children == null) {
+            return;
+        }
+        for (PDNameTreeNode<PDComplexFileSpecification> child : children) {
+            Map<String, PDComplexFileSpecification> childNames = child.getNames();
+            if (childNames != null) {
+                processEmbeddedDocNames(childNames);
+            }
+        }
+    }
+
+    /**
+     * Hands every named file specification in the map to the embedded
+     * document extractor. A null or empty map is ignored.
+     *
+     * @param embeddedFileNames file specifications keyed by name
+     */
+    private void processEmbeddedDocNames(Map<String, PDComplexFileSpecification> embeddedFileNames)
+            throws IOException, SAXException, TikaException {
+        boolean nothingToDo = embeddedFileNames == null || embeddedFileNames.isEmpty();
+        if (!nothingToDo) {
+            EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
+            for (Map.Entry<String, PDComplexFileSpecification> entry : embeddedFileNames.entrySet()) {
+                extractMultiOSPDEmbeddedFiles(entry.getKey(), entry.getValue(), extractor);
+            }
+        }
+    }
+
+    /**
+     * Extracts every variant (generic, Mac, DOS, Unix) of an embedded file
+     * specification. All variants are pulled, not just the first non-null one.
+     *
+     * @param defaultName name used when a variant carries no file name
+     * @param spec        file specification; ignored when null
+     * @param extractor   extractor that receives each embedded file
+     */
+    private void extractMultiOSPDEmbeddedFiles(String defaultName,
+                                               PDComplexFileSpecification spec,
+                                               EmbeddedDocumentExtractor extractor)
+            throws IOException, SAXException, TikaException {
+        if (spec != null) {
+            extractPDEmbeddedFile(defaultName, spec.getFile(), spec.getEmbeddedFile(), extractor);
+            extractPDEmbeddedFile(defaultName, spec.getFileMac(), spec.getEmbeddedFileMac(), extractor);
+            extractPDEmbeddedFile(defaultName, spec.getFileDos(), spec.getEmbeddedFileDos(), extractor);
+            extractPDEmbeddedFile(defaultName, spec.getFileUnix(), spec.getEmbeddedFileUnix(), extractor);
+        }
+    }
+
+    /**
+     * Passes one embedded file to the extractor and then writes an empty
+     * {@code <div class="embedded" id="...">} marker for it.
+     *
+     * @param defaultName name used when {@code fileName} is null
+     * @param fileName    file name from the specification, may be null
+     * @param file        embedded file; a null value is skipped silently
+     * @param extractor   extractor deciding whether, and how, to parse the file
+     */
+    private void extractPDEmbeddedFile(String defaultName, String fileName, PDEmbeddedFile file,
+                                       EmbeddedDocumentExtractor extractor)
+            throws SAXException, IOException, TikaException {
+        if (file == null) {
+            //skip silently
+            return;
+        }
+
+        String resolvedName = (fileName != null) ? fileName : defaultName;
+
+        // TODO: other metadata?
+        Metadata embeddedMetadata = new Metadata();
+        embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, resolvedName);
+        embeddedMetadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
+        embeddedMetadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
+        embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
+
+        if (!extractor.shouldParseEmbedded(embeddedMetadata)) {
+            return;
+        }
+
+        TikaInputStream embeddedStream = null;
+        try {
+            embeddedStream = TikaInputStream.get(file.createInputStream());
+            extractor.parseEmbedded(embeddedStream, new EmbeddedContentHandler(xhtml),
+                    embeddedMetadata, false);
+
+            AttributesImpl markerAttributes = new AttributesImpl();
+            markerAttributes.addAttribute("", "class", "class", "CDATA", "embedded");
+            markerAttributes.addAttribute("", "id", "id", "CDATA", resolvedName);
+            xhtml.startElement("div", markerAttributes);
+            xhtml.endElement("div");
+        } finally {
+            // close failures are deliberately ignored
+            IOUtils.closeQuietly(embeddedStream);
+        }
+    }
+
+    /**
+     * Central policy for IOExceptions hit in the middle of processing.
+     * <p>
+     * When the configuration asks for intermediate IOExceptions to be
+     * caught, the message is added to the metadata as a warning and the
+     * exception is remembered in {@link #exceptions}; otherwise the
+     * exception is rethrown unchanged.
+     *
+     * @param e the exception that occurred
+     * @throws IOException {@code e} itself, when catching is disabled
+     */
+    void handleCatchableIOE(IOException e) throws IOException {
+        if (!config.isCatchIntermediateIOExceptions()) {
+            throw e;
+        }
+        String warning = e.getMessage();
+        metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
+                (warning != null) ? warning : "IOException, no message");
+        exceptions.add(e);
+    }
+
+    /**
+     * Renders the current page to an image, writes it to a temporary file
+     * and runs Tesseract on that file, streaming the recognized text
+     * inline into {@link #xhtml}. Does nothing when the OCR strategy is
+     * {@code NO_OCR}.
+     * <p>
+     * The page is rendered at the configured OCR DPI so that the pixel
+     * data matches the resolution recorded in the written image.
+     * Previously the page was rendered at a fixed 2.0 scale regardless of
+     * the configured DPI.
+     *
+     * @throws TikaException if OCR is requested but Tesseract is not available
+     * @throws IOException   if rendering, writing or OCR fails with an
+     *                       IOException and intermediate IOExceptions are
+     *                       not being caught, or if writing the OCR
+     *                       content raises a SAXException (attached as cause)
+     * @throws SAXException  declared for callers; SAX failures inside this
+     *                       method are wrapped in an IOException
+     */
+    void doOCROnCurrentPage() throws IOException, TikaException, SAXException {
+        if (config.getOCRStrategy().equals(NO_OCR)) {
+            return;
+        }
+        TesseractOCRConfig tesseractConfig =
+                context.get(TesseractOCRConfig.class, DEFAULT_TESSERACT_CONFIG);
+
+        TesseractOCRParser tesseractOCRParser = new TesseractOCRParser();
+        if (!tesseractOCRParser.hasTesseract(tesseractConfig)) {
+            throw new TikaException("Tesseract is not available. " +
+                    "Please set the OCR_STRATEGY to NO_OCR or configure Tesseract correctly");
+        }
+
+        PDFRenderer renderer = new PDFRenderer(pdDocument);
+        TemporaryResources tmp = new TemporaryResources();
+        try {
+            // Use one DPI value for both rendering and the image metadata.
+            int dpi = config.getOCRDPI();
+            BufferedImage image =
+                    renderer.renderImageWithDPI(pageIndex, dpi, config.getOCRImageType());
+            Path tmpFile = tmp.createTempFile();
+            try (OutputStream os = Files.newOutputStream(tmpFile)) {
+                //TODO: get output format from TesseractConfig
+                ImageIOUtil.writeImage(image, config.getOCRImageFormatName(), os, dpi);
+            }
+            try (InputStream is = TikaInputStream.get(tmpFile)) {
+                tesseractOCRParser.parseInline(is, xhtml, tesseractConfig);
+            }
+        } catch (IOException e) {
+            handleCatchableIOE(e);
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause("error writing OCR content from PDF", e);
+        } finally {
+            // always remove the temporary image file
+            tmp.dispose();
+        }
+    }
+
+    /**
+     * Finishes a page: extracts files attached through annotations,
+     * optionally writes link and markup annotation text, optionally runs
+     * OCR (strategy {@code OCR_AND_TEXT_EXTRACTION}) and closes the page
+     * {@code <div>}. The page index is advanced whether or not the page
+     * completed normally.
+     * <p>
+     * IOExceptions are routed through {@link #handleCatchableIOE} so that
+     * they honor the "catch intermediate IOExceptions" setting and are
+     * recorded in the metadata, like every other IOException handled by
+     * this class.
+     *
+     * @param page the page that has just been processed
+     * @throws IOException on SAX/Tika failures (attached as cause), or on
+     *                     an IOException when intermediate IOExceptions
+     *                     are not being caught
+     */
+    @Override
+    protected void endPage(PDPage page) throws IOException {
+        try {
+            EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
+            for (PDAnnotation annotation : page.getAnnotations()) {
+
+                if (annotation instanceof PDAnnotationFileAttachment) {
+                    PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
+                    PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile();
+                    try {
+                        extractMultiOSPDEmbeddedFiles("", fileSpec, extractor);
+                    } catch (SAXException e) {
+                        throw new IOExceptionWithCause("file embedded in annotation sax exception", e);
+                    } catch (TikaException e) {
+                        throw new IOExceptionWithCause("file embedded in annotation tika exception", e);
+                    } catch (IOException e) {
+                        handleCatchableIOE(e);
+                    }
+                }
+                // TODO: remove once PDFBOX-1143 is fixed:
+                if (config.getExtractAnnotationText()) {
+                    if (annotation instanceof PDAnnotationLink) {
+                        writeLinkAnnotation((PDAnnotationLink) annotation);
+                    }
+                    if (annotation instanceof PDAnnotationMarkup) {
+                        writeMarkupAnnotation((PDAnnotationMarkup) annotation);
+                    }
+                }
+            }
+            if (config.getOCRStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) {
+                doOCROnCurrentPage();
+            }
+            xhtml.endElement("div");
+        } catch (SAXException | TikaException e) {
+            throw new IOExceptionWithCause("Unable to end a page", e);
+        } catch (IOException e) {
+            // Apply the same catch-or-rethrow policy used everywhere else.
+            handleCatchableIOE(e);
+        } finally {
+            pageIndex++;
+        }
+    }
+
+    /** Writes a link annotation with a URI action as an empty anchor inside an annotation div. */
+    private void writeLinkAnnotation(PDAnnotationLink annotationLink) throws SAXException {
+        PDAction action = annotationLink.getAction();
+        if (!(action instanceof PDActionURI)) {
+            return;
+        }
+        String link = ((PDActionURI) action).getURI();
+        if (link != null) {
+            xhtml.startElement("div", "class", "annotation");
+            xhtml.startElement("a", "href", link);
+            xhtml.endElement("a");
+            xhtml.endElement("div");
+        }
+    }
+
+    /** Writes the title, subject and contents of a markup annotation, skipping it when all are null. */
+    private void writeMarkupAnnotation(PDAnnotationMarkup annotationMarkup) throws SAXException {
+        String title = annotationMarkup.getTitlePopup();
+        String subject = annotationMarkup.getSubject();
+        String contents = annotationMarkup.getContents();
+        // TODO: maybe also annotationMarkup.getRichContents()?
+        if (title == null && subject == null && contents == null) {
+            return;
+        }
+        xhtml.startElement("div", "class", "annotation");
+        writeAnnotationPart("annotationTitle", title);
+        writeAnnotationPart("annotationSubject", subject);
+        writeAnnotationPart("annotationContents", contents);
+        xhtml.endElement("div");
+    }
+
+    /** Writes one non-null annotation text as a div carrying the given class. */
+    private void writeAnnotationPart(String cssClass, String text) throws SAXException {
+        if (text != null) {
+            xhtml.startElement("div", "class", cssClass);
+            xhtml.characters(text);
+            xhtml.endElement("div");
+        }
+    }
+
+    /**
+     * Starts the XHTML output document.
+     *
+     * @param pdf the document being processed (not used here)
+     * @throws IOException wrapping the SAXException if the handler fails
+     */
+    @Override
+    protected void startDocument(PDDocument pdf) throws IOException {
+        try {
+            xhtml.startDocument();
+        } catch (SAXException saxFailure) {
+            throw new IOExceptionWithCause("Unable to start a document", saxFailure);
+        }
+    }
+
+    /**
+     * Writes the document-level extras and closes the XHTML document:
+     * bookmark text first, then embedded documents, then (if enabled)
+     * AcroForm content. IOExceptions from the embedded-document and
+     * AcroForm steps go through {@link #handleCatchableIOE}.
+     *
+     * @param pdf the document being finished
+     * @throws IOException wrapping any TikaException or SAXException, or
+     *                     an IOException that is not being caught
+     */
+    @Override
+    protected void endDocument(PDDocument pdf) throws IOException {
+        try {
+            // Extract text for any bookmarks:
+            extractBookmarkText();
+
+            try {
+                extractEmbeddedDocuments(pdf);
+            } catch (IOException embeddedFailure) {
+                handleCatchableIOE(embeddedFailure);
+            }
+
+            //extract acroform data at end of doc
+            if (config.getExtractAcroFormContent()) {
+                try {
+                    extractAcroForm(pdf);
+                } catch (IOException acroFormFailure) {
+                    handleCatchableIOE(acroFormFailure);
+                }
+            }
+
+            xhtml.endDocument();
+        } catch (TikaException | SAXException e) {
+            throw new IOExceptionWithCause("Unable to end a document", e);
+        }
+    }
+
+    /**
+     * Writes the titles of the document outline (bookmarks), if the
+     * document has one.
+     */
+    void extractBookmarkText() throws SAXException {
+        PDDocumentOutline documentOutline = document.getDocumentCatalog().getDocumentOutline();
+        if (documentOutline == null) {
+            return;
+        }
+        extractBookmarkText(documentOutline);
+    }
+
+    /**
+     * Writes the children of an outline node as a {@code <ul>} with one
+     * {@code <li>} per child title. After each item, the child's own
+     * children are written recursively. Nothing is written for a node
+     * without children.
+     *
+     * @param bookmark outline node whose children are written
+     */
+    void extractBookmarkText(PDOutlineNode bookmark) throws SAXException {
+        PDOutlineItem child = bookmark.getFirstChild();
+        if (child == null) {
+            return;
+        }
+        xhtml.startElement("ul");
+        for (; child != null; child = child.getNextSibling()) {
+            xhtml.startElement("li");
+            xhtml.characters(child.getTitle());
+            xhtml.endElement("li");
+            // Recurse:
+            extractBookmarkText(child);
+        }
+        xhtml.endElement("ul");
+    }
+
+    /**
+     * Writes the document's form content.
+     * <p>
+     * If the form carries XFA data, that is tried first and, on success,
+     * nothing else is written. If there is no XFA data or reading it
+     * fails, the traditional AcroForm fields are written as an ordered
+     * list inside {@code <div class="acroform">}.
+     * <p>
+     * Thank you, Ben Litchfield, for org.apache.pdfbox.examples.fdf.PrintFields;
+     * this code derives from Ben's code.
+     *
+     * @param pdf document whose catalog is inspected for an AcroForm
+     */
+    void extractAcroForm(PDDocument pdf) throws IOException,
+            SAXException {
+        PDDocumentCatalog catalog = pdf.getDocumentCatalog();
+        PDAcroForm form = (catalog == null) ? null : catalog.getAcroForm();
+        if (form == null) {
+            return;
+        }
+
+        PDXFAResource xfa = form.getXFA();
+        if (xfa != null) {
+            XFAExtractor xfaExtractor = new XFAExtractor();
+            try (InputStream is = new BufferedInputStream(
+                    new ByteArrayInputStream(xfa.getBytes()))) {
+                xfaExtractor.extract(is, xhtml, metadata, context);
+                //XFA succeeded, nothing more to do
+                return;
+            } catch (XMLStreamException | IOException e) {
+                //if there was an xml parse exception in xfa, try the AcroForm
+            }
+        }
+
+        @SuppressWarnings("rawtypes")
+        List fields = form.getFields();
+        if (fields == null) {
+            return;
+        }
+
+        xhtml.startElement("div", "class", "acroform");
+        xhtml.startElement("ol");
+        for (Object candidate : fields) {
+            // instanceof is false for null, so no separate null check is needed
+            if (candidate instanceof PDField) {
+                processAcroField((PDField) candidate, 0);
+            }
+        }
+        xhtml.endElement("ol");
+        xhtml.endElement("div");
+    }
+
+ private void processAcroField(PDField field, final int currentRecursiveDepth)
+ throws SAXException, IOException {
+
+ if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) {
+ return;
+ }
+ addFieldString(field);
+ if (field instanceof PDNonTerminalField) {
+ int r = currentRecursiveDepth + 1;
+ xhtml.startElement("ol");
+ for (PDField child : ((PDNonTerminalField)field).getChildren()) {
+ processAcroField(child, r);
+ }
+ xhtml.endElement("ol");
+ }
+ }
+
+    /**
+     * Writes a single field as {@code <li altName="...">partialName: value</li>}.
+     * <p>
+     * The partial name goes into the text and the alternate name into the
+     * {@code altName} attribute; the fully qualified name is ignored for
+     * now. Signature fields are delegated to {@link #handleSignature}.
+     * Nothing is written when there is neither an attribute nor any text.
+     *
+     * @param field field to write
+     */
+    private void addFieldString(PDField field) throws SAXException {
+        String partialName = field.getPartialName();
+        String alternateName = field.getAlternateFieldName();
+
+        AttributesImpl attributes = new AttributesImpl();
+        if (alternateName != null) {
+            attributes.addAttribute("", "altName", "altName", "CDATA", alternateName);
+        }
+
+        //signature fields have their own representation
+        if (field instanceof PDSignatureField) {
+            handleSignature(attributes, (PDSignatureField) field);
+            return;
+        }
+
+        StringBuilder text = new StringBuilder();
+        if (partialName != null) {
+            text.append(partialName).append(": ");
+        }
+        String value = field.getValueAsString();
+        if (value != null && !"null".equals(value)) {
+            text.append(value);
+        }
+
+        if (attributes.getLength() == 0 && text.length() == 0) {
+            return;
+        }
+        xhtml.startElement("li", attributes);
+        xhtml.characters(text.toString());
+        xhtml.endElement("li");
+    }
+
+    /**
+     * Writes the data of a signature field as
+     * {@code <li><ol type="signaturedata"><li signdata="key">value</li>...</ol></li>}.
+     * <p>
+     * Nothing is written when the field has no signature or when every
+     * signature value (name, contact info, location, reason, date) is
+     * null or empty. The emptiness check looks at the map's values; it
+     * previously looked at the keys, which are never empty, so an empty
+     * list was emitted for signatures without data.
+     *
+     * @param parentAttributes attributes placed on the outer {@code <li>}
+     * @param sigField         signature field to describe
+     */
+    private void handleSignature(AttributesImpl parentAttributes, PDSignatureField sigField)
+            throws SAXException {
+
+        PDSignature sig = sigField.getSignature();
+        if (sig == null) {
+            return;
+        }
+        // TreeMap keeps the output ordered by key.
+        Map<String, String> vals = new TreeMap<>();
+        vals.put("name", sig.getName());
+        vals.put("contactInfo", sig.getContactInfo());
+        vals.put("location", sig.getLocation());
+        vals.put("reason", sig.getReason());
+
+        Calendar cal = sig.getSignDate();
+        if (cal != null) {
+            dateFormat.setTimeZone(cal.getTimeZone());
+            vals.put("date", dateFormat.format(cal.getTime()));
+        }
+
+        //see if there is any data
+        boolean hasData = false;
+        for (String val : vals.values()) {
+            if (val != null && !val.equals("")) {
+                hasData = true;
+                break;
+            }
+        }
+        if (!hasData) {
+            return;
+        }
+
+        xhtml.startElement("li", parentAttributes);
+
+        AttributesImpl attrs = new AttributesImpl();
+        attrs.addAttribute("", "type", "type", "CDATA", "signaturedata");
+
+        xhtml.startElement("ol", attrs);
+        for (Map.Entry<String, String> e : vals.entrySet()) {
+            if (e.getValue() == null || e.getValue().equals("")) {
+                continue;
+            }
+            attrs = new AttributesImpl();
+            attrs.addAttribute("", "signdata", "signdata", "CDATA", e.getKey());
+            xhtml.startElement("li", attrs);
+            xhtml.characters(e.getValue());
+            xhtml.endElement("li");
+        }
+        xhtml.endElement("ol");
+        xhtml.endElement("li");
+    }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/ebe70289/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
new file mode 100644
index 0000000..3ad551d
--- /dev/null
+++ b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import java.io.IOException;
+import java.io.Writer;
+
+import org.apache.commons.io.IOExceptionWithCause;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.apache.pdfbox.text.TextPosition;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+
+/**
+ * Utility class that overrides the {@link PDFTextStripper} functionality
+ * to integrate text extraction via OCR only.
+ *
+ */
+class OCR2XHTML extends AbstractPDF2XHTML {
+
+ private OCR2XHTML(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata,
+ PDFParserConfig config)
+ throws IOException {
+ super(document, handler, context, metadata, config);
+ }
+
+ /**
+ * Converts the given PDF document (and related metadata) to a stream
+ * of XHTML SAX events sent to the given content handler.
+ *
+ * @param document PDF document
+ * @param handler SAX content handler
+ * @param metadata PDF metadata
+ * @throws SAXException if the content handler fails to process SAX events
+ * @throws TikaException if there was an exception outside of per page processing
+ */
+ public static void process(
+ PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata,
+ PDFParserConfig config)
+ throws SAXException, TikaException {
+ OCR2XHTML ocr2XHTML = null;
+ try {
+ ocr2XHTML = new OCR2XHTML(document, handler, context, metadata, config);
+ ocr2XHTML.writeText(document, new Writer() {
+ @Override
+ public void write(char[] cbuf, int off, int len) {
+ }
+
+ @Override
+ public void flush() {
+ }
+
+ @Override
+ public void close() {
+ }
+ });
+ } catch (IOException e) {
+ if (e.getCause() instanceof SAXException) {
+ throw (SAXException) e.getCause();
+ } else {
+ throw new TikaException("Unable to extract PDF content", e);
+ }
+ }
+ if (ocr2XHTML.exceptions.size() > 0) {
+ //throw the first
+ throw new TikaException("Unable to extract all PDF content",
+ ocr2XHTML.exceptions.get(0));
+ }
+ }
+
+ @Override
+ public void processPage(PDPage pdPage) throws IOException {
+ try {
+ startPage(pdPage);
+ doOCROnCurrentPage();
+ endPage(pdPage);
+ } catch (TikaException |SAXException e) {
+ throw new IOExceptionWithCause(e);
+ } catch (IOException e) {
+ handleCatchableIOE(e);
+ }
+ }
+
+ @Override
+ protected void writeString(String text) throws IOException {
+ //no-op
+ }
+
+ @Override
+ protected void writeCharacters(TextPosition text) throws IOException {
+ //no-op
+ }
+
+ @Override
+ protected void writeWordSeparator() throws IOException {
+ //no-op
+ }
+
+ @Override
+ protected void writeLineSeparator() throws IOException {
+ //no-op
+ }
+
+}
+