You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/11/10 04:01:36 UTC
tika git commit: TIKA-2175 -- add extraction for inline jp2/jpx from PDFParser
Repository: tika
Updated Branches:
refs/heads/master 47ba703d6 -> 91cdce43d
TIKA-2175 -- add extraction for inline jp2/jpx from PDFParser
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/91cdce43
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/91cdce43
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/91cdce43
Branch: refs/heads/master
Commit: 91cdce43d22cd6726375a83c7842fa299035a258
Parents: 47ba703
Author: tballison <ta...@mitre.org>
Authored: Wed Nov 9 23:01:13 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Wed Nov 9 23:01:13 2016 -0500
----------------------------------------------------------------------
tika-parsers/pom.xml | 6 +++++
.../org/apache/tika/parser/pdf/PDF2XHTML.java | 24 ++++++++++++++++----
.../apache/tika/parser/pdf/PDFParserTest.java | 2 ++
3 files changed, 28 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/91cdce43/tika-parsers/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index 31a727d..b7f4d38 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -334,6 +334,12 @@
<version>1.3.1</version>
<scope>test</scope>
</dependency>
+ <dependency>
+ <groupId>com.github.jai-imageio</groupId>
+ <artifactId>jai-imageio-jpeg2000</artifactId>
+ <version>1.3.0</version>
+ <scope>test</scope>
+ </dependency>
<!-- edu.ucar dependencies -->
<dependency>
<groupId>edu.ucar</groupId>
http://git-wip-us.apache.org/repos/asf/tika/blob/91cdce43/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index 2a81103..d89dce4 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -34,6 +34,7 @@ import org.apache.commons.io.IOExceptionWithCause;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSStream;
+import org.apache.pdfbox.filter.MissingImageReaderException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
@@ -67,6 +68,9 @@ class PDF2XHTML extends AbstractPDF2XHTML {
COSName.DCT_DECODE.getName(),
COSName.DCT_DECODE_ABBREVIATION.getName());
+ private static final List<String> JP2 =
+ Arrays.asList(COSName.JPX_DECODE.getName());
+
/**
* This keeps track of the pdf object ids for inline
* images that have been processed.
@@ -168,7 +172,14 @@ class PDF2XHTML extends AbstractPDF2XHTML {
for (COSName name : resources.getXObjectNames()) {
- PDXObject object = resources.getXObject(name);
+ PDXObject object = null;
+ try {
+ object = resources.getXObject(name);
+ } catch (MissingImageReaderException e) {
+ EmbeddedDocumentUtil.recordException(e, metadata);
+ continue;
+ }
+
if (object == null) {
continue;
}
@@ -195,11 +206,12 @@ class PDF2XHTML extends AbstractPDF2XHTML {
} else if (extension.equals("tiff")) {
embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/tiff");
extension = "tif";
+ } else if (extension.equals("jpx")) {
+ embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/jp2");
} else {
//TODO: determine if we need to add more image types
- //throw new RuntimeException("EXTEN:" + extension);
+// throw new RuntimeException("EXTEN:" + extension);
}
-
Integer imageNumber = processedInlineImages.get(cosStream);
if (imageNumber == null) {
imageNumber = inlineImageCounter++;
@@ -268,7 +280,11 @@ class PDF2XHTML extends AbstractPDF2XHTML {
// for CMYK and other "unusual" colorspaces, the JPEG will be converted
ImageIOUtil.writeImage(image, suffix, out);
}
- } else {
+ } else if ("jp2".equals(suffix) || "jpx".equals(suffix)) {
+ InputStream data = pdImage.createInputStream(JP2);
+ org.apache.pdfbox.io.IOUtils.copy(data, out);
+ org.apache.pdfbox.io.IOUtils.closeQuietly(data);
+ } else{
ImageIOUtil.writeImage(image, suffix, out);
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/91cdce43/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 1f0f4d6..f29f544 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1292,6 +1292,8 @@ public class PDFParserTest extends TikaTest {
}
}
+ //TODO: figure out how to test jp2 embedded with OCR
+
private void assertException(String path, Parser parser, ParseContext context, Class expected) {
boolean noEx = false;
InputStream is = getResourceAsStream(path);