You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/06/16 16:50:55 UTC

[tika] branch master updated: add heif mimetype support (#278)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new d9dbe0c  add heif mimetype support (#278)
d9dbe0c is described below

commit d9dbe0c8ca20b5954f04b0633ec1e34bd5ffc4d9
Author: Christian <me...@rndm.de>
AuthorDate: Tue Jun 16 18:50:46 2020 +0200

    add heif mimetype support (#278)
    
    fixes TIKA-2830
---
 .../org/apache/tika/mime/tika-mimetypes.xml        |  32 ++++++++++
 .../org/apache/tika/parser/image/HeifParser.java   |  68 +++++++++++++++++++++
 .../tika/parser/image/ImageMetadataExtractor.java  |  14 +++++
 .../services/org.apache.tika.parser.Parser         |   1 +
 .../apache/tika/parser/image/HeifParserTest.java   |  58 ++++++++++++++++++
 .../test/resources/test-documents/IMG_1034.heic    | Bin 0 -> 1499892 bytes
 6 files changed, 173 insertions(+)

diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 3d2cc28..f8b44f1 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -5763,6 +5763,38 @@
     <glob pattern="*.webp"/>
   </mime-type>
 
+  <mime-type type="image/heic">
+    <tika:link>https://en.wikipedia.org/wiki/High_Efficiency_Image_File_Format</tika:link>
+    <magic priority="50">
+      <match value="ftypheic" type="string" offset="4"/>
+      <match value="ftypheix" type="string" offset="4"/>
+    </magic>
+    <glob pattern="*.heic"/>
+  </mime-type>
+
+  <mime-type type="image/heif">
+    <tika:link>https://en.wikipedia.org/wiki/High_Efficiency_Image_File_Format</tika:link>
+    <magic priority="50">
+      <match value="ftypmif1" type="string" offset="4"/>
+    </magic>
+    <glob pattern="*.heif"/>
+  </mime-type>
+
+  <mime-type type="image/heif-sequence">
+    <tika:link>https://en.wikipedia.org/wiki/High_Efficiency_Image_File_Format</tika:link>
+    <magic priority="50">
+      <match value="ftypmsf1" type="string" offset="4"/>
+    </magic>
+  </mime-type>
+
+  <mime-type type="image/heic-sequence">
+    <tika:link>https://en.wikipedia.org/wiki/High_Efficiency_Image_File_Format</tika:link>
+    <magic priority="50">
+      <match value="ftyphevc" type="string" offset="4"/>
+      <match value="ftyphevx" type="string" offset="4"/>
+    </magic>
+  </mime-type>
+
   <mime-type type="image/wmf">
     <alias type="image/x-wmf"/>
     <alias type="application/x-msmetafile"/>
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/image/HeifParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/image/HeifParser.java
new file mode 100644
index 0000000..9880d3c
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/image/HeifParser.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+
+/** Parser for HEIF/HEIC images and image sequences; emits metadata and an empty document. */
+public class HeifParser extends AbstractParser {
+    private static final Set<MediaType> SUPPORTED_TYPES = new HashSet<>(Arrays.asList(
+            MediaType.image("heif"), MediaType.image("heif-sequence"),
+            MediaType.image("heic"), MediaType.image("heic-sequence")));
+
+    /** Returns the four HEIF/HEIC media types handled by this parser. */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Passes the file behind the stream to {@code ImageMetadataExtractor.parseHeif},
+     * disposes the temporary resources, then writes an empty XHTML document.
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+                      ParseContext context) throws IOException, SAXException, TikaException {
+        TemporaryResources temporaryResources = new TemporaryResources();
+        try {
+            TikaInputStream tikaStream = TikaInputStream.get(stream, temporaryResources);
+            ImageMetadataExtractor extractor = new ImageMetadataExtractor(metadata);
+            extractor.parseHeif(tikaStream.getFile());
+        } finally {
+            temporaryResources.dispose();
+        }
+        XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
+        document.startDocument();
+        document.endDocument();
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
index 0ec472c..6645feb 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
@@ -20,6 +20,7 @@ import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.FileInputStream;
 import java.text.DecimalFormat;
 import java.text.DecimalFormatSymbols;
 import java.text.SimpleDateFormat;
@@ -29,6 +30,7 @@ import java.util.Locale;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+import com.drew.imaging.heif.HeifMetadataReader;
 import com.drew.imaging.jpeg.JpegMetadataReader;
 import com.drew.imaging.jpeg.JpegProcessingException;
 import com.drew.imaging.riff.RiffProcessingException;
@@ -150,6 +152,18 @@ public class ImageMetadataExtractor {
         }
     }
 
+    /**
+     * Reads HEIF metadata from {@code file} and passes it to {@code handle};
+     * the stream is closed on exit and MetadataException becomes TikaException.
+     */
+    public void parseHeif(File file) throws IOException, TikaException {
+        try (InputStream is = new FileInputStream(file)) {
+            handle(HeifMetadataReader.readMetadata(is));
+        } catch (MetadataException e) {
+            throw new TikaException("Can't process Heif data", e);
+        }
+    }
+
     public void parseRawExif(InputStream stream, int length, boolean needsExifHeader)
             throws IOException, SAXException, TikaException {
         byte[] exif;
diff --git a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index 252f31c..7725f8c 100644
--- a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -35,6 +35,7 @@ org.apache.tika.parser.image.ImageParser
 org.apache.tika.parser.image.PSDParser
 org.apache.tika.parser.image.TiffParser
 org.apache.tika.parser.image.WebPParser
+org.apache.tika.parser.image.HeifParser
 org.apache.tika.parser.iptc.IptcAnpaParser
 org.apache.tika.parser.iwork.IWorkPackageParser
 org.apache.tika.parser.jpeg.JpegParser
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/image/HeifParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/image/HeifParserTest.java
new file mode 100644
index 0000000..681b616
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/image/HeifParserTest.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.image;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.junit.Test;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.io.InputStream;
+
+import static org.junit.Assert.assertEquals;
+
+
+/** Checks that AutoDetectParser detects a HEIC file and extracts its HEIF metadata. */
+public class HeifParserTest {
+
+    Parser parser = new AutoDetectParser();
+
+    /*
+        The example photo in test-documents (IMG_1034.heic)
+        is in the public domain.  It was retrieved from:
+        https://github.com/drewnoakes/metadata-extractor-images/tree/master/heic
+     */
+    @Test
+    public void testSimple() throws Exception {
+        Metadata metadata = new Metadata();
+        InputStream stream = getClass().getResourceAsStream("/test-documents/IMG_1034.heic");
+        try {
+            parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+            assertEquals("heic", metadata.get("Major Brand"));
+            assertEquals("512 pixels", metadata.get("Width"));
+            assertEquals("512 pixels", metadata.get("Height"));
+            assertEquals("image/heic", metadata.get(Metadata.CONTENT_TYPE));
+        } finally {
+            // Close even when parsing or an assertion fails.
+            IOUtils.closeQuietly(stream);
+        }
+    }
+}
diff --git a/tika-parsers/src/test/resources/test-documents/IMG_1034.heic b/tika-parsers/src/test/resources/test-documents/IMG_1034.heic
new file mode 100644
index 0000000..9c63182
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/IMG_1034.heic differ