You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/09/01 14:51:36 UTC

[tika] branch master updated: TIKA-2451 - Extract number of tiffs in a multi-page tiff (TIKA-2451); many thanks to Mike Cantrell for supplying a test file.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new 8000cfe  TIKA-2451 - Extract number of tiffs in a multi-page tiff (TIKA-2451); many thanks to Mike Cantrell for supplying a test file.
8000cfe is described below

commit 8000cfec02a0c872241a9bfe90c21675b9118054
Author: tballison <ta...@mitre.org>
AuthorDate: Fri Sep 1 10:51:24 2017 -0400

    TIKA-2451 - Extract number of tiffs in a multi-page tiff (TIKA-2451); many thanks to Mike Cantrell for supplying a test file.
---
 CHANGES.txt                                        |   4 +++-
 .../main/java/org/apache/tika/metadata/TIFF.java   |   3 +++
 .../tika/parser/image/ImageMetadataExtractor.java  |  24 +++++++++++++++++++++
 .../apache/tika/parser/image/TiffParserTest.java   |  13 ++++++++++-
 .../tika/parser/ocr/TesseractOCRParserTest.java    |   9 ++++++++
 .../test-documents/testTIFF_multipage.tif          | Bin 0 -> 156867 bytes
 6 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index b4e0e35..c99af63 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,6 +1,8 @@
 Release 1.17 - ???
 
-  * Fix detection of emails extracted from mbox (TIKA-2456)
+  * Extract number of tiffs in a multi-page tiff (TIKA-2451).
+
+  * Fix detection of emails extracted from mbox (TIKA-2456).
   
   * Add OverrideDetector and allow PSTParser to specify body content type
     as text or html -- to avoid incorrect auto-detection of
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TIFF.java b/tika-core/src/main/java/org/apache/tika/metadata/TIFF.java
index f4ecacc..f81f51b 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TIFF.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TIFF.java
@@ -139,4 +139,7 @@ public interface TIFF {
      */
     Property ORIGINAL_DATE =
        Property.internalDate("exif:DateTimeOriginal");
+
+    Property EXIF_PAGE_COUNT =
+            Property.externalInteger("exif:PageCount");
 }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
index 0c2cf4d..64ddf73 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
@@ -55,6 +55,7 @@ import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.IPTC;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TIFF;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.image.xmp.JempboxExtractor;
@@ -80,6 +81,7 @@ public class ImageMetadataExtractor {
     public ImageMetadataExtractor(Metadata metadata) {
         this(metadata,
                 new CopyUnknownFieldsHandler(),
+                new TiffPageNumberHandler(),
                 new JpegCommentHandler(),
                 new ExifHandler(),
                 new DimensionsHandler(),
@@ -290,6 +292,28 @@ public class ImageMetadataExtractor {
         }
     }
 
+    static class TiffPageNumberHandler implements DirectoryHandler {
+        public boolean supports(Class<? extends Directory> directoryType) {
+            return true;
+        }
+
+        public void handle(Directory directory, Metadata metadata)
+                throws MetadataException {
+            //TODO: after upgrading metadataextractor, swap out
+            //magic number with ExifDirectoryBase.TAG_PAGE_NUMBER
+            if (directory.containsTag(297)) {
+                int[] pageNums = directory.getIntArray(297);
+                //pages can be in any order, take the max
+                if (pageNums != null && pageNums.length > 1) {
+                    Integer curr = metadata.getInt(TIFF.EXIF_PAGE_COUNT);
+                    if (curr == null || curr < pageNums[1]) {
+                        metadata.set(TIFF.EXIF_PAGE_COUNT, pageNums[1]);
+                    }
+                }
+            }
+        }
+    }
+
     /**
      * Basic image properties for TIFF and JPEG, at least.
      */
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java
index d506c33..df6c2cb 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java
@@ -23,14 +23,17 @@ import java.io.InputStream;
 import java.util.Arrays;
 import java.util.List;
 
+import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TIFF;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.junit.Test;
 import org.xml.sax.helpers.DefaultHandler;
 
-public class TiffParserTest {
+public class TiffParserTest extends TikaTest {
+
     private final Parser parser = new TiffParser();
 
     @Test
@@ -63,4 +66,12 @@ public class TiffParserTest {
         assertTrue("got " + subject, subject.contains("cat"));
         assertTrue("got " + subject, subject.contains("garden"));
     }
+
+    @Test
+    public void testPageCount() throws Exception {
+        assertEquals(2L,
+                (long)getXML("testTIFF_multipage.tif")
+                .metadata
+                .getInt(TIFF.EXIF_PAGE_COUNT));
+    }
 }
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index 4c0ab76..63d9c96 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -278,4 +278,13 @@ public class TesseractOCRParserTest extends TikaTest {
         Matcher m = Pattern.compile("The\\s{5,20}quick").matcher(xml);
         assertTrue(m.find());
     }
+
+    @Test
+    public void confirmMultiPageTiffHandling() throws Exception {
+        assumeTrue(canRun());
+        //tesseract should handle multipage tiffs by itself
+        //let's confirm that
+        String xml = getXML("testTIFF_multipage.tif").xml;
+        assertContains("Page 2", xml);
+    }
 }
diff --git a/tika-parsers/src/test/resources/test-documents/testTIFF_multipage.tif b/tika-parsers/src/test/resources/test-documents/testTIFF_multipage.tif
new file mode 100644
index 0000000..00c1cae
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testTIFF_multipage.tif differ

-- 
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.