You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/01/11 14:04:24 UTC

tika git commit: TIKA-2232 -- add processing of jbig2 (with necessary non ASL 2.0 libs) via Pascal Essiembre

Repository: tika
Updated Branches:
  refs/heads/2.x c14e75070 -> 0bc9bd896


TIKA-2232 -- add processing of jbig2 (with necessary non ASL 2.0 libs) via Pascal Essiembre


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/0bc9bd89
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/0bc9bd89
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/0bc9bd89

Branch: refs/heads/2.x
Commit: 0bc9bd89675d866b6ccd9e8b9e04ecfed8988544
Parents: c14e750
Author: tballison <ta...@mitre.org>
Authored: Wed Jan 11 09:04:17 2017 -0500
Committer: tballison <ta...@mitre.org>
Committed: Wed Jan 11 09:04:17 2017 -0500

----------------------------------------------------------------------
 CHANGES.txt                                     |   4 ++
 .../org/apache/tika/mime/tika-mimetypes.xml     |  15 ++++++-
 .../tika-parser-multimedia-module/pom.xml       |  11 +++++
 .../apache/tika/parser/image/ImageParser.java   |   6 ++-
 .../org/apache/tika/parser/pdf/PDF2XHTML.java   |  16 ++++++-
 .../tika/parser/image/ImageParserTest.java      |  12 ++++-
 .../apache/tika/parser/pdf/PDFParserTest.java   |  45 +++++++++++++++++++
 .../test/resources/test-documents/testJBIG2.jb2 | Bin 0 -> 346 bytes
 .../resources/test-documents/testPDF_JBIG2.pdf  | Bin 0 -> 23945 bytes
 9 files changed, 105 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/0bc9bd89/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 283aa24..ad5b835 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -17,6 +17,10 @@ Release 2.0 - ???
 
 Release 1.15 -???
 
+  * Add parsing of JBIG2 and extraction of JBIG2 from PDFs when the
+    required dependencies are added to the classpath by the user.
+    Contributed by Pascal Essiembre (TIKA-2232).
+
   * Mime magic for the OneNote family (.one / .onetoc / .onepkg), no parser
     (TIKA-2224).
 

http://git-wip-us.apache.org/repos/asf/tika/blob/0bc9bd89/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
----------------------------------------------------------------------
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 75edf3b..1afdc0e 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -5113,7 +5113,20 @@
     <glob pattern="*.ft11"/>
     <glob pattern="*.ft12"/>
   </mime-type>
-
+  <mime-type type="image/x-jbig2">
+    <alias type="image/x-jb2"/>
+    <acronym>JBIG2</acronym>
+    <_comment>
+      A lossless image compression standard from the
+      Joint Bi-level Image Experts Group.
+    </_comment>
+    <tika:link>http://www.itu.int/rec/T-REC-T.88/en</tika:link>
+    <magic priority="50">
+      <match value="0x974A42320D0A1A0A" type="string" offset="0"/>
+    </magic>
+    <glob pattern="*.jb2"/>
+    <glob pattern="*.jbig2"/>
+  </mime-type>
   <mime-type type="image/x-jp2-codestream">
     <_comment>JPEG 2000 Codestream</_comment>
     <magic priority="25">

http://git-wip-us.apache.org/repos/asf/tika/blob/0bc9bd89/tika-parser-modules/tika-parser-multimedia-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/pom.xml b/tika-parser-modules/tika-parser-multimedia-module/pom.xml
index e0ffec8..5ee33da 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/pom.xml
+++ b/tika-parser-modules/tika-parser-multimedia-module/pom.xml
@@ -152,6 +152,17 @@
       <version>1.3.0</version>
       <scope>test</scope>
     </dependency>
+    <!-- Java ImageIO plugin for JBIG2 support (often used in PDF)
+         This jbig2 dependency is not distributed with Tika because of its
+           license (GPLv3), which is why it is declared here with "test" scope.
+           https://github.com/levigo/jbig2-imageio
+        -->
+    <dependency>
+      <groupId>com.levigo.jbig2</groupId>
+      <artifactId>levigo-jbig2-imageio</artifactId>
+      <version>1.6.5</version>
+      <scope>test</scope>
+    </dependency>
   </dependencies>
   
   <build>

http://git-wip-us.apache.org/repos/asf/tika/blob/0bc9bd89/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageParser.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageParser.java
index 8fd23eb..af96cda 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageParser.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/image/ImageParser.java
@@ -61,7 +61,8 @@ public class ImageParser extends AbstractParser {
                     MediaType.image("png"),
                     MediaType.image("vnd.wap.wbmp"),
                     MediaType.image("x-icon"),
-                    MediaType.image("x-xcf"))));
+                    MediaType.image("x-xcf"),
+                    MediaType.image("x-jbig2"))));
 
     private static void setIfPresent(Metadata metadata, String imageIOkey, String tikaKey) {
         if (metadata.get(imageIOkey) != null) {
@@ -80,6 +81,9 @@ public class ImageParser extends AbstractParser {
     }
 
     private static void loadMetadata(IIOMetadata imageMetadata, Metadata metadata) {
+        if (imageMetadata == null) {
+            return;
+        }
         String[] names = imageMetadata.getMetadataFormatNames();
         if (names == null) {
             return;

http://git-wip-us.apache.org/repos/asf/tika/blob/0bc9bd89/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index b416a61..a06d46a 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -211,7 +211,21 @@ class PDF2XHTML extends AbstractPDF2XHTML {
                     extension = "tif";
                 } else if (extension.equals("jpx")) {
                     embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/jp2");
-                } else {
+                }
+
+                // PDFBox does not yet return JBIG2 extension and extracting
+                // inline JBIG2 images fails with test file testPDF_JBIG2.pdf
+                // if we explicitly set the content type to image/x-jbig2
+                // (no "pages" are found when image is embedded).
+                // It works when it thinks it is PNG so we do not force it to
+                // jb2 for parsing until this issue is addressed in PDFBox and
+                // Levigo jbig2-imageio.  Will result in bad content-type in
+                // metadata for now, but that's better than not being able to
+                // handle JBIG2 in PDFs at all.
+                //                } else if (extension.equals("jb2")) {
+                //                    embeddedMetadata.set(
+                //                            Metadata.CONTENT_TYPE, "image/x-jbig2");
+                else {
                     //TODO: determine if we need to add more image types
                     //throw new RuntimeException("EXTEN:" + extension);
                 }

http://git-wip-us.apache.org/repos/asf/tika/blob/0bc9bd89/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageParserTest.java
index 83d72c9..abf173f 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageParserTest.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageParserTest.java
@@ -20,6 +20,7 @@ import static org.junit.Assert.assertEquals;
 
 import java.io.InputStream;
 
+import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
@@ -27,7 +28,7 @@ import org.apache.tika.parser.Parser;
 import org.junit.Test;
 import org.xml.sax.helpers.DefaultHandler;
 
-public class ImageParserTest {
+public class ImageParserTest extends TikaTest {
 
     private final Parser parser = new ImageParser();
 
@@ -159,4 +160,13 @@ public class ImageParserTest {
         assertEquals("8 8 8", metadata.get(Metadata.BITS_PER_SAMPLE));
     }
 
+    @Test // TIKA-2232
+    public void testJBIG2() throws Exception {
+
+        XMLResult r = getXML("testJBIG2.jb2");
+        assertEquals("78", r.metadata.get("height"));
+        assertEquals("328", r.metadata.get("width"));
+        assertEquals("image/x-jbig2", r.metadata.get("Content-Type"));
+    }
+
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/0bc9bd89/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index f76aea7..9d92971 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -642,6 +642,51 @@ public class PDFParserTest extends TikaTest {
         assertEquals(TYPE_DOC.toString(), metadatas.get(4).get(Metadata.CONTENT_TYPE));
     }
 
+    @Test // TIKA-2232
+    public void testEmbeddedJBIG2Image() throws Exception {
+
+        ParseContext context = new ParseContext();
+        PDFParserConfig config = new PDFParserConfig();
+        config.setExtractInlineImages(true);
+        config.setExtractUniqueInlineImagesOnly(false);
+        context.set(PDFParserConfig.class, config);
+
+        List<Metadata> metadatas = getRecursiveMetadata("testPDF_JBIG2.pdf", context);
+
+        assertContains("test images compressed using JBIG2",
+                metadatas.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
+
+        assertEquals(2, metadatas.size());
+        assertNull("Exception found: " + metadatas.get(0).get(
+                "X-TIKA:EXCEPTION:warn"), metadatas.get(0).get(
+                "X-TIKA:EXCEPTION:warn"));
+        assertEquals("Invalid height.", "91", metadatas.get(1).get("height"));
+        assertEquals("Invalid width.", "352", metadatas.get(1).get("width"));
+
+        assertNull(metadatas.get(0).get(Metadata.RESOURCE_NAME_KEY));
+
+        //TODO mime/extension should be tested against JBIG2 once better
+        //supported by PDFBox and Levigo jbig2-imageio
+        assertEquals("image0.png",
+                metadatas.get(1).get(Metadata.RESOURCE_NAME_KEY));
+        assertEquals(MediaType.image("png").toString(),
+                metadatas.get(1).get(Metadata.CONTENT_TYPE));
+    }
+
+    @Test
+    public void testJBIG2OCROnly() throws Exception {
+        if (!canRunOCR()) {
+            return;
+        }
+        PDFParserConfig config = new PDFParserConfig();
+        config.setOCRStrategy(PDFParserConfig.OCR_STRATEGY.OCR_ONLY);
+        ParseContext context = new ParseContext();
+        context.set(PDFParserConfig.class, config);
+        context.set(Parser.class, new AutoDetectParser());
+        //make sure everything works with regular xml _and_ with recursive
+        XMLResult xmlResult = getXML("testPDF_JBIG2.pdf", context);
+        assertContains("Norconex", xmlResult.xml);
+    }
 
     @Test
     public void testEmbeddedFilesInAnnotations() throws Exception {

http://git-wip-us.apache.org/repos/asf/tika/blob/0bc9bd89/tika-test-resources/src/test/resources/test-documents/testJBIG2.jb2
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testJBIG2.jb2 b/tika-test-resources/src/test/resources/test-documents/testJBIG2.jb2
new file mode 100644
index 0000000..8a6756f
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testJBIG2.jb2 differ

http://git-wip-us.apache.org/repos/asf/tika/blob/0bc9bd89/tika-test-resources/src/test/resources/test-documents/testPDF_JBIG2.pdf
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testPDF_JBIG2.pdf b/tika-test-resources/src/test/resources/test-documents/testPDF_JBIG2.pdf
new file mode 100644
index 0000000..08c80d7
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testPDF_JBIG2.pdf differ