You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/01/11 14:04:24 UTC
tika git commit: TIKA-2232 -- add processing of jbig2 (with necessary
non ASL 2.0 libs) via Pascal Essiembre
Repository: tika
Updated Branches:
refs/heads/2.x c14e75070 -> 0bc9bd896
TIKA-2232 -- add processing of jbig2 (with necessary non ASL 2.0 libs) via Pascal Essiembre
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/0bc9bd89
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/0bc9bd89
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/0bc9bd89
Branch: refs/heads/2.x
Commit: 0bc9bd89675d866b6ccd9e8b9e04ecfed8988544
Parents: c14e750
Author: tballison <ta...@mitre.org>
Authored: Wed Jan 11 09:04:17 2017 -0500
Committer: tballison <ta...@mitre.org>
Committed: Wed Jan 11 09:04:17 2017 -0500
----------------------------------------------------------------------
CHANGES.txt | 4 ++
.../org/apache/tika/mime/tika-mimetypes.xml | 15 ++++++-
.../tika-parser-multimedia-module/pom.xml | 11 +++++
.../apache/tika/parser/image/ImageParser.java | 6 ++-
.../org/apache/tika/parser/pdf/PDF2XHTML.java | 16 ++++++-
.../tika/parser/image/ImageParserTest.java | 12 ++++-
.../apache/tika/parser/pdf/PDFParserTest.java | 45 +++++++++++++++++++
.../test/resources/test-documents/testJBIG2.jb2 | Bin 0 -> 346 bytes
.../resources/test-documents/testPDF_JBIG2.pdf | Bin 0 -> 23945 bytes
9 files changed, 105 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/0bc9bd89/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 283aa24..ad5b835 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -17,6 +17,10 @@ Release 2.0 - ???
Release 1.15 -???
+ * Add parsing of JBIG2 and extraction of JBIG2 from PDFs when
+ required dependencies are added to class path by user.
+ Contributed by Pascal Essiembre (TIKA-2232).
+
* Mime magic for the OneNote family (.one / .onetoc / .onepkg), no parser
(TIKA-2224).
http://git-wip-us.apache.org/repos/asf/tika/blob/0bc9bd89/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 75edf3b..1afdc0e 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -5113,7 +5113,20 @@
<glob pattern="*.ft11"/>
<glob pattern="*.ft12"/>
</mime-type>
-
+ <mime-type type="image/x-jbig2">
+ <alias type="image/x-jb2"/>
+ <acronym>JBIG2</acronym>
+ <_comment>
+ A lossless image compression standard from the
+ Joint Bi-level Image Experts Group.
+ </_comment>
+ <tika:link>http://www.itu.int/rec/T-REC-T.88/en</tika:link>
+ <magic priority="50">
+ <match value="0x974A42320D0A1A0A" type="string" offset="0"/>
+ </magic>
+ <glob pattern="*.jb2"/>
+ <glob pattern="*.jbig2"/>
+ </mime-type>
<mime-type type="image/x-jp2-codestream">
<_comment>JPEG 2000 Codestream</_comment>
<magic priority="25">
http://git-wip-us.apache.org/repos/asf/tika/blob/0bc9bd89/tika-parser-modules/tika-parser-multimedia-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/pom.xml b/tika-parser-modules/tika-parser-multimedia-module/pom.xml
index e0ffec8..5ee33da 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/pom.xml
+++ b/tika-parser-modules/tika-parser-multimedia-module/pom.xml
@@ -152,6 +152,17 @@
<version>1.3.0</version>
<scope>test</scope>
</dependency>
+ <!-- Java ImageIO plugin for JBIG2 support (often used in PDF)
+ This jbig2 dep is not distributed with Tika due to a licensing
+ issue (GPLv3). That's why it is included here with "test" scope only.
+ https://github.com/levigo/jbig2-imageio
+ -->
+ <dependency>
+ <groupId>com.levigo.jbig2</groupId>
+ <artifactId>levigo-jbig2-imageio</artifactId>
+ <version>1.6.5</version>
+ <scope>test</scope>
+ </dependency>
</dependencies>
<build>
http://git-wip-us.apache.org/repos/asf/tika/blob/0bc9bd89/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageParser.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageParser.java
index 8fd23eb..af96cda 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageParser.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageParser.java
@@ -61,7 +61,8 @@ public class ImageParser extends AbstractParser {
MediaType.image("png"),
MediaType.image("vnd.wap.wbmp"),
MediaType.image("x-icon"),
- MediaType.image("x-xcf"))));
+ MediaType.image("x-xcf"),
+ MediaType.image("x-jbig2"))));
private static void setIfPresent(Metadata metadata, String imageIOkey, String tikaKey) {
if (metadata.get(imageIOkey) != null) {
@@ -80,6 +81,9 @@ public class ImageParser extends AbstractParser {
}
private static void loadMetadata(IIOMetadata imageMetadata, Metadata metadata) {
+ if (imageMetadata == null) {
+ return;
+ }
String[] names = imageMetadata.getMetadataFormatNames();
if (names == null) {
return;
http://git-wip-us.apache.org/repos/asf/tika/blob/0bc9bd89/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index b416a61..a06d46a 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -211,7 +211,21 @@ class PDF2XHTML extends AbstractPDF2XHTML {
extension = "tif";
} else if (extension.equals("jpx")) {
embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/jp2");
- } else {
+ }
+
+ // PDFBox does not yet return JBIG2 extension and extracting
+ // inline JBIG2 images fails with test file testPDF_JBIG2.pdf
+ // if we explicitly set the content type to image/x-jbig2
+ // (no "pages" are found when image is embedded).
+ // It works when it thinks it is PNG so we do not force it to
+ // jb2 for parsing until this issue is addressed in PDFBox and
+ // Levigo jbig2-imageio. Will result in bad content-type in
+ // metadata for now, but that's better than not being able to
+ // handle JBIG2 in PDFs at all.
+ // } else if (extension.equals("jb2")) {
+ // embeddedMetadata.set(
+ // Metadata.CONTENT_TYPE, "image/x-jbig2");
+ else {
//TODO: determine if we need to add more image types
//throw new RuntimeException("EXTEN:" + extension);
}
http://git-wip-us.apache.org/repos/asf/tika/blob/0bc9bd89/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageParserTest.java
index 83d72c9..abf173f 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageParserTest.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageParserTest.java
@@ -20,6 +20,7 @@ import static org.junit.Assert.assertEquals;
import java.io.InputStream;
+import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
@@ -27,7 +28,7 @@ import org.apache.tika.parser.Parser;
import org.junit.Test;
import org.xml.sax.helpers.DefaultHandler;
-public class ImageParserTest {
+public class ImageParserTest extends TikaTest {
private final Parser parser = new ImageParser();
@@ -159,4 +160,13 @@ public class ImageParserTest {
assertEquals("8 8 8", metadata.get(Metadata.BITS_PER_SAMPLE));
}
+ @Test // TIKA-2232
+ public void testJBIG2() throws Exception {
+
+ XMLResult r = getXML("testJBIG2.jb2");
+ assertEquals("78", r.metadata.get("height"));
+ assertEquals("328", r.metadata.get("width"));
+ assertEquals("image/x-jbig2", r.metadata.get("Content-Type"));
+ }
+
}
http://git-wip-us.apache.org/repos/asf/tika/blob/0bc9bd89/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index f76aea7..9d92971 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -642,6 +642,51 @@ public class PDFParserTest extends TikaTest {
assertEquals(TYPE_DOC.toString(), metadatas.get(4).get(Metadata.CONTENT_TYPE));
}
+ @Test // TIKA-2232
+ public void testEmbeddedJBIG2Image() throws Exception {
+
+ ParseContext context = new ParseContext();
+ PDFParserConfig config = new PDFParserConfig();
+ config.setExtractInlineImages(true);
+ config.setExtractUniqueInlineImagesOnly(false);
+ context.set(PDFParserConfig.class, config);
+
+ List<Metadata> metadatas = getRecursiveMetadata("testPDF_JBIG2.pdf", context);
+
+ assertContains("test images compressed using JBIG2",
+ metadatas.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
+
+ assertEquals(2, metadatas.size());
+ assertNull("Exception found: " + metadatas.get(0).get(
+ "X-TIKA:EXCEPTION:warn"), metadatas.get(0).get(
+ "X-TIKA:EXCEPTION:warn"));
+ assertEquals("Invalid height.", "91", metadatas.get(1).get("height"));
+ assertEquals("Invalid width.", "352", metadatas.get(1).get("width"));
+
+ assertNull(metadatas.get(0).get(Metadata.RESOURCE_NAME_KEY));
+
+ //TODO mime/extension should be tested against JBIG2 once better
+ //supported by PDFBox and Levigo jbig2-imageio
+ assertEquals("image0.png",
+ metadatas.get(1).get(Metadata.RESOURCE_NAME_KEY));
+ assertEquals(MediaType.image("png").toString(),
+ metadatas.get(1).get(Metadata.CONTENT_TYPE));
+ }
+
+ @Test
+ public void testJBIG2OCROnly() throws Exception {
+ if (!canRunOCR()) {
+ return;
+ }
+ PDFParserConfig config = new PDFParserConfig();
+ config.setOCRStrategy(PDFParserConfig.OCR_STRATEGY.OCR_ONLY);
+ ParseContext context = new ParseContext();
+ context.set(PDFParserConfig.class, config);
+ context.set(Parser.class, new AutoDetectParser());
+ //make sure OCR-only extraction works via the regular xml path (recursive is not exercised here)
+ XMLResult xmlResult = getXML("testPDF_JBIG2.pdf", context);
+ assertContains("Norconex", xmlResult.xml);
+ }
@Test
public void testEmbeddedFilesInAnnotations() throws Exception {
http://git-wip-us.apache.org/repos/asf/tika/blob/0bc9bd89/tika-test-resources/src/test/resources/test-documents/testJBIG2.jb2
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testJBIG2.jb2 b/tika-test-resources/src/test/resources/test-documents/testJBIG2.jb2
new file mode 100644
index 0000000..8a6756f
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testJBIG2.jb2 differ
http://git-wip-us.apache.org/repos/asf/tika/blob/0bc9bd89/tika-test-resources/src/test/resources/test-documents/testPDF_JBIG2.pdf
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testPDF_JBIG2.pdf b/tika-test-resources/src/test/resources/test-documents/testPDF_JBIG2.pdf
new file mode 100644
index 0000000..08c80d7
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testPDF_JBIG2.pdf differ