You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/09/01 14:51:36 UTC
[tika] branch master updated: TIKA-2451 - Extract number of tiffs
in a multi-page tiff (TIKA-2451);
many thanks to Mike Cantrell for supplying a test file.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 8000cfe TIKA-2451 - Extract number of tiffs in a multi-page tiff (TIKA-2451); many thanks to Mike Cantrell for supplying a test file.
8000cfe is described below
commit 8000cfec02a0c872241a9bfe90c21675b9118054
Author: tballison <ta...@mitre.org>
AuthorDate: Fri Sep 1 10:51:24 2017 -0400
TIKA-2451 - Extract number of tiffs in a multi-page tiff (TIKA-2451); many thanks to Mike Cantrell for supplying a test file.
---
CHANGES.txt | 4 +++-
.../main/java/org/apache/tika/metadata/TIFF.java | 3 +++
.../tika/parser/image/ImageMetadataExtractor.java | 24 +++++++++++++++++++++
.../apache/tika/parser/image/TiffParserTest.java | 13 ++++++++++-
.../tika/parser/ocr/TesseractOCRParserTest.java | 9 ++++++++
.../test-documents/testTIFF_multipage.tif | Bin 0 -> 156867 bytes
6 files changed, 51 insertions(+), 2 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index b4e0e35..c99af63 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,6 +1,8 @@
Release 1.17 - ???
- * Fix detection of emails extracted from mbox (TIKA-2456)
+ * Extract number of tiffs in a multi-page tiff (TIKA-2451).
+
+ * Fix detection of emails extracted from mbox (TIKA-2456).
* Add OverrideDetector and allow PSTParser to specify body content type
as text or html -- to avoid incorrect auto-detection of
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TIFF.java b/tika-core/src/main/java/org/apache/tika/metadata/TIFF.java
index f4ecacc..f81f51b 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TIFF.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TIFF.java
@@ -139,4 +139,7 @@ public interface TIFF {
*/
Property ORIGINAL_DATE =
Property.internalDate("exif:DateTimeOriginal");
+
+ Property EXIF_PAGE_COUNT =
+ Property.externalInteger("exif:PageCount");
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
index 0c2cf4d..64ddf73 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
@@ -55,6 +55,7 @@ import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.IPTC;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TIFF;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.image.xmp.JempboxExtractor;
@@ -80,6 +81,7 @@ public class ImageMetadataExtractor {
public ImageMetadataExtractor(Metadata metadata) {
this(metadata,
new CopyUnknownFieldsHandler(),
+ new TiffPageNumberHandler(),
new JpegCommentHandler(),
new ExifHandler(),
new DimensionsHandler(),
@@ -290,6 +292,28 @@ public class ImageMetadataExtractor {
}
}
+ static class TiffPageNumberHandler implements DirectoryHandler { //stores the largest page count seen in tag 297 as TIFF.EXIF_PAGE_COUNT
+ public boolean supports(Class<? extends Directory> directoryType) {
+ return true; //accepts every directory type; handle() itself checks for the tag
+ }
+
+ public void handle(Directory directory, Metadata metadata)
+ throws MetadataException {
+ //TODO: after upgrading metadataextractor, replace the magic
+ //number 297 with ExifDirectoryBase.TAG_PAGE_NUMBER
+ if (directory.containsTag(297)) {
+ int[] pageNums = directory.getIntArray(297); //NOTE(review): presumably {page number, total pages} per the TIFF PageNumber tag -- confirm
+ //pages can be in any order, so keep the max of pageNums[1] seen so far
+ if (pageNums != null && pageNums.length > 1) { //index 1 is read below, so at least two values are required
+ Integer curr = metadata.getInt(TIFF.EXIF_PAGE_COUNT); //null when no count has been stored yet
+ if (curr == null || curr < pageNums[1]) {
+ metadata.set(TIFF.EXIF_PAGE_COUNT, pageNums[1]);
+ }
+ }
+ }
+ }
+ }
+
/**
* Basic image properties for TIFF and JPEG, at least.
*/
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java
index d506c33..df6c2cb 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java
@@ -23,14 +23,17 @@ import java.io.InputStream;
import java.util.Arrays;
import java.util.List;
+import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TIFF;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.junit.Test;
import org.xml.sax.helpers.DefaultHandler;
-public class TiffParserTest {
+public class TiffParserTest extends TikaTest {
+
private final Parser parser = new TiffParser();
@Test
@@ -63,4 +66,12 @@ public class TiffParserTest {
assertTrue("got " + subject, subject.contains("cat"));
assertTrue("got " + subject, subject.contains("garden"));
}
+
+ @Test
+ public void testPageCount() throws Exception { //parsing testTIFF_multipage.tif must report TIFF.EXIF_PAGE_COUNT == 2
+ assertEquals(2L,
+ (long)getXML("testTIFF_multipage.tif") //the cast unboxes the Integer, so a missing property fails with a NullPointerException
+ .metadata
+ .getInt(TIFF.EXIF_PAGE_COUNT));
+ }
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index 4c0ab76..63d9c96 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -278,4 +278,13 @@ public class TesseractOCRParserTest extends TikaTest {
Matcher m = Pattern.compile("The\\s{5,20}quick").matcher(xml);
assertTrue(m.find());
}
+
+ @Test
+ public void confirmMultiPageTiffHandling() throws Exception {
+ assumeTrue(canRun()); //skips the test instead of failing it when canRun() is false
+ //tesseract should handle multipage tiffs by itself
+ //let's confirm that
+ String xml = getXML("testTIFF_multipage.tif").xml;
+ assertContains("Page 2", xml); //NOTE(review): presumably text that only appears on the second page -- confirm against the test file
+ }
}
diff --git a/tika-parsers/src/test/resources/test-documents/testTIFF_multipage.tif b/tika-parsers/src/test/resources/test-documents/testTIFF_multipage.tif
new file mode 100644
index 0000000..00c1cae
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testTIFF_multipage.tif differ
--
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.