You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/12/02 19:03:08 UTC

[tika] branch branch_1x updated: TIKA-2630: Wrong height and width metadata for JPEG images (#255)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_1x by this push:
     new 90880e1  TIKA-2630: Wrong height and width metadata for JPEG images (#255)
90880e1 is described below

commit 90880e179f98c06ca948b8fed65e902f5b520e6b
Author: Dave Meikle <dm...@apache.org>
AuthorDate: Mon Dec 2 19:03:00 2019 +0000

    TIKA-2630: Wrong height and width metadata for JPEG images (#255)
    
    * TIKA-2630:
    - Added extraction of image height/width from ExifSubIFDDirectory for compressed images
    - Include directory name as key qualifier for Exif directories to avoid clashes
    
    * TIKA-2630: Tidied up code
---
 .../tika/parser/image/ImageMetadataExtractor.java  | 23 ++++++++++++++++++++--
 .../apache/tika/parser/jpeg/JpegParserTest.java    | 10 +++++-----
 .../tika/parser/ocr/TesseractOCRParserTest.java    |  2 +-
 .../org/apache/tika/parser/rtf/RTFParserTest.java  |  4 ++--
 4 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
index 18ab8f1..9fec322 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
@@ -260,7 +260,11 @@ public class ImageMetadataExtractor {
                 throws MetadataException {
             if (directory.getTags() != null) {
                 for (Tag tag : directory.getTags()) {
-                    metadata.set(tag.getTagName(), tag.getDescription());
+                    if (directory instanceof ExifDirectoryBase) {
+                        metadata.set(directory.getName() + ":" + tag.getTagName(), tag.getDescription());
+                    } else {
+                        metadata.set(tag.getTagName(), tag.getDescription());
+                    }
                 }
             }
         }
@@ -288,7 +292,11 @@ public class ImageMetadataExtractor {
                         } else if (Boolean.FALSE.toString().equalsIgnoreCase(value)) {
                             value = Boolean.FALSE.toString();
                         }
-                        metadata.set(name, value);
+                        if (directory instanceof ExifDirectoryBase) {
+                            metadata.set(directory.getName() + ":" + name, value);
+                        } else {
+                            metadata.set(name, value);
+                        }
                     }
                 }
             }
@@ -493,6 +501,17 @@ public class ImageMetadataExtractor {
                 metadata.set(Metadata.IMAGE_LENGTH,
                         trimPixels(directory.getDescription(ExifThumbnailDirectory.TAG_IMAGE_HEIGHT)));
             }
+
+            // For compressed images read from ExifSubIFDDirectory; each dimension is guarded by its own tag
+            if (directory.containsTag(ExifSubIFDDirectory.TAG_EXIF_IMAGE_WIDTH)) {
+                metadata.set(Metadata.IMAGE_WIDTH,
+                        trimPixels(directory.getDescription(ExifSubIFDDirectory.TAG_EXIF_IMAGE_WIDTH)));
+            }
+            if (directory.containsTag(ExifSubIFDDirectory.TAG_EXIF_IMAGE_HEIGHT)) {
+                metadata.set(Metadata.IMAGE_LENGTH,
+                        trimPixels(directory.getDescription(ExifSubIFDDirectory.TAG_EXIF_IMAGE_HEIGHT)));
+            }
+
         }
 
         /**
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
index d32dfc4..dd0d234 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
@@ -65,8 +65,8 @@ public class JpegParserTest {
         parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
 
         // Core EXIF/TIFF tags
-        assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
-        assertEquals("68", metadata.get(Metadata.IMAGE_LENGTH));
+        assertEquals("3888", metadata.get(Metadata.IMAGE_WIDTH));
+        assertEquals("2592", metadata.get(Metadata.IMAGE_LENGTH));
         assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
         assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL));
 
@@ -86,7 +86,7 @@ public class JpegParserTest {
         // Check that EXIF/TIFF tags come through with their raw values too
         // (This may be removed for Tika 1.0, as we support more of them
         //  with explicit Metadata entries)
-        assertEquals("Canon EOS 40D", metadata.get("Model"));
+        assertEquals("Canon EOS 40D", metadata.get("Exif IFD0:Model"));
 
         // Common tags
         assertEquals("2009-10-02T23:02:49", metadata.get(Metadata.LAST_MODIFIED));
@@ -120,8 +120,8 @@ public class JpegParserTest {
         assertEquals("-54.1234", metadata.get(Metadata.LONGITUDE));
 
         // Core EXIF/TIFF tags
-        assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
-        assertEquals("68", metadata.get(Metadata.IMAGE_LENGTH));
+        assertEquals("3888", metadata.get(Metadata.IMAGE_WIDTH));
+        assertEquals("2592", metadata.get(Metadata.IMAGE_LENGTH));
         assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
         assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL));
 
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index bb87262..b9b9504 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -255,7 +255,7 @@ public class TesseractOCRParserTest extends TikaTest {
         m = getXML("testTIFF.tif").metadata;
         assertEquals("100", m.get(Metadata.IMAGE_WIDTH));
         assertEquals("75", m.get(Metadata.IMAGE_LENGTH));
-        assertEquals("72 dots per inch", m.get("Y Resolution"));
+        assertEquals("72 dots per inch", m.get("Exif IFD0:Y Resolution"));
     }
 
     //TODO: add unit tests for jp2/jpx/ppm TIKA-2174
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
index 3a22bdd..f19cab0 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
@@ -468,8 +468,8 @@ public class RTFParserTest extends TikaTest {
         assertEquals("false", meta_jpg.get(RTFMetadata.THUMBNAIL));
         assertEquals("false", meta_jpg_exif.get(RTFMetadata.THUMBNAIL));
 
-        assertEquals(51, meta_jpg.names().length);
-        assertEquals(115, meta_jpg_exif.names().length);
+        assertEquals(50, meta_jpg.names().length);
+        assertEquals(116, meta_jpg_exif.names().length);
     }
 
     @Test