You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/06/16 18:14:06 UTC
[tika] 10/13: add heif mimetype support (#278)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 6d066cbcd4589831d4f74f977479d4d902f0e5df
Author: Christian <me...@rndm.de>
AuthorDate: Tue Jun 16 18:50:46 2020 +0200
add heif mimetype support (#278)
fixes TIKA-2830
---
.../org/apache/tika/mime/tika-mimetypes.xml | 32 ++++++++++
.../org/apache/tika/parser/image/HeifParser.java | 68 +++++++++++++++++++++
.../tika/parser/image/ImageMetadataExtractor.java | 14 +++++
.../services/org.apache.tika.parser.Parser | 1 +
.../apache/tika/parser/image/HeifParserTest.java | 58 ++++++++++++++++++
.../test/resources/test-documents/IMG_1034.heic | Bin 0 -> 1499892 bytes
6 files changed, 173 insertions(+)
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index a7fe00e..64d7dfe 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -5582,6 +5582,38 @@
<glob pattern="*.webp"/>
</mime-type>
+ <!-- image/heic: magic entries for the strings "ftypheic" and "ftypheix" at offset 4 (priority 50), plus the *.heic filename glob. -->
+ <mime-type type="image/heic">
+ <tika:link>https://en.wikipedia.org/wiki/High_Efficiency_Image_File_Format</tika:link>
+ <magic priority="50">
+ <match value="ftypheic" type="string" offset="4"/>
+ <match value="ftypheix" type="string" offset="4"/>
+ </magic>
+ <glob pattern="*.heic"/>
+ </mime-type>
+
+ <!-- image/heif: one magic entry for the string "ftypmif1" at offset 4 (priority 50), plus the *.heif filename glob. -->
+ <mime-type type="image/heif">
+ <tika:link>https://en.wikipedia.org/wiki/High_Efficiency_Image_File_Format</tika:link>
+ <magic priority="50">
+ <match value="ftypmif1" type="string" offset="4"/>
+ </magic>
+ <glob pattern="*.heif"/>
+ </mime-type>
+
+ <!-- image/heif-sequence: one magic entry for the string "ftypmsf1" at offset 4 (priority 50). No glob is declared in this entry. -->
+ <mime-type type="image/heif-sequence">
+ <tika:link>https://en.wikipedia.org/wiki/High_Efficiency_Image_File_Format</tika:link>
+ <magic priority="50">
+ <match value="ftypmsf1" type="string" offset="4"/>
+ </magic>
+ </mime-type>
+
+ <!-- image/heic-sequence: magic entries for the strings "ftyphevc" and "ftyphevx" at offset 4 (priority 50). No glob is declared in this entry. -->
+ <mime-type type="image/heic-sequence">
+ <tika:link>https://en.wikipedia.org/wiki/High_Efficiency_Image_File_Format</tika:link>
+ <magic priority="50">
+ <match value="ftyphevc" type="string" offset="4"/>
+ <match value="ftyphevx" type="string" offset="4"/>
+ </magic>
+ </mime-type>
+
<mime-type type="image/wmf">
<alias type="image/x-wmf"/>
<alias type="application/x-msmetafile"/>
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/image/HeifParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/image/HeifParser.java
new file mode 100644
index 0000000..9880d3c
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/image/HeifParser.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+
+/**
+ * Parser for the {@code image/heif}, {@code image/heif-sequence},
+ * {@code image/heic} and {@code image/heic-sequence} media types.
+ * <p>
+ * Metadata extraction is delegated to
+ * {@link ImageMetadataExtractor#parseHeif(java.io.File)}; the XHTML output
+ * is an empty document.
+ */
+public class HeifParser extends AbstractParser {
+
+    /**
+     * Media types handled by this parser. Wrapped as unmodifiable because the
+     * same instance is handed out by {@link #getSupportedTypes(ParseContext)}.
+     */
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(
+                    new HashSet<>(
+                            Arrays.asList(
+                                    MediaType.image("heif"),
+                                    MediaType.image("heif-sequence"),
+                                    MediaType.image("heic"),
+                                    MediaType.image("heic-sequence")
+                            )
+                    )
+            );
+
+    /**
+     * Returns the media types this parser accepts.
+     *
+     * @param context the parse context (not consulted)
+     * @return an unmodifiable set of the supported media types
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Extracts HEIF metadata into {@code metadata} and writes an empty XHTML
+     * document to {@code handler}.
+     *
+     * @param stream   the image data
+     * @param handler  receives the start/end of the XHTML document
+     * @param metadata populated by {@link ImageMetadataExtractor}
+     * @param context  the parse context (not consulted)
+     * @throws IOException   if the stream or its backing file cannot be read
+     * @throws SAXException  if the handler rejects the XHTML events
+     * @throws TikaException if the image metadata cannot be processed
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        TemporaryResources tmp = new TemporaryResources();
+        try {
+            // The extractor works on a File, so obtain one through TikaInputStream;
+            // anything registered with tmp is released in the finally block.
+            TikaInputStream tis = TikaInputStream.get(stream, tmp);
+            new ImageMetadataExtractor(metadata).parseHeif(tis.getFile());
+        } finally {
+            tmp.dispose();
+        }
+
+        // No textual content is produced: only open and close the document.
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        xhtml.endDocument();
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
index 912c0f1..622f48a 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
@@ -20,6 +20,7 @@ import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
+import java.io.FileInputStream;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.text.SimpleDateFormat;
@@ -29,6 +30,7 @@ import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import com.drew.imaging.heif.HeifMetadataReader;
import com.drew.imaging.jpeg.JpegMetadataReader;
import com.drew.imaging.jpeg.JpegProcessingException;
import com.drew.imaging.riff.RiffProcessingException;
@@ -150,6 +152,18 @@ public class ImageMetadataExtractor {
}
}
+ /**
+  * Reads the metadata of a HEIF file with {@link HeifMetadataReader} and
+  * passes the result to {@code handle}.
+  *
+  * @param file the HEIF file to read
+  * @throws IOException   if the file cannot be opened or read; propagated unchanged
+  * @throws TikaException if handling the extracted metadata raises a
+  *                       {@link MetadataException}
+  */
+ public void parseHeif(File file) throws IOException, TikaException {
+     // try-with-resources guarantees the file handle is released even when
+     // reading or handling the metadata fails.
+     try (InputStream heifStream = new FileInputStream(file)) {
+         handle(HeifMetadataReader.readMetadata(heifStream));
+     } catch (MetadataException e) {
+         throw new TikaException("Can't process Heif data", e);
+     }
+ }
+
public void parseRawExif(InputStream stream, int length, boolean needsExifHeader)
throws IOException, SAXException, TikaException {
byte[] exif;
diff --git a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index ceb1399..f0bdb01 100644
--- a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -34,6 +34,7 @@ org.apache.tika.parser.image.ImageParser
org.apache.tika.parser.image.PSDParser
org.apache.tika.parser.image.TiffParser
org.apache.tika.parser.image.WebPParser
+org.apache.tika.parser.image.HeifParser
org.apache.tika.parser.iptc.IptcAnpaParser
org.apache.tika.parser.iwork.IWorkPackageParser
org.apache.tika.parser.jpeg.JpegParser
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/image/HeifParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/image/HeifParserTest.java
new file mode 100644
index 0000000..681b616
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/image/HeifParserTest.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.image;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.junit.Test;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.io.InputStream;
+
+import static org.junit.Assert.assertEquals;
+
+
+/**
+ * Checks that a HEIC sample is detected and parsed through
+ * {@link AutoDetectParser}.
+ */
+public class HeifParserTest {
+
+    Parser parser = new AutoDetectParser();
+
+    /*
+        The example photo in test-documents (IMG_1034.heic)
+        is in the public domain. It was retrieved from:
+        https://github.com/drewnoakes/metadata-extractor-images/tree/master/heic
+     */
+    @Test
+    public void testSimple() throws Exception {
+        Metadata metadata = new Metadata();
+        InputStream stream =
+                getClass().getResourceAsStream("/test-documents/IMG_1034.heic");
+
+        try {
+            parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+        } finally {
+            // Release the stream even when parsing throws, so a failing run
+            // does not leave it open.
+            IOUtils.closeQuietly(stream);
+        }
+
+        assertEquals("heic", metadata.get("Major Brand"));
+        assertEquals("512 pixels", metadata.get("Width"));
+        assertEquals("512 pixels", metadata.get("Height"));
+        assertEquals("image/heic", metadata.get(Metadata.CONTENT_TYPE));
+    }
+
+}
diff --git a/tika-parsers/src/test/resources/test-documents/IMG_1034.heic b/tika-parsers/src/test/resources/test-documents/IMG_1034.heic
new file mode 100644
index 0000000..9c63182
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/IMG_1034.heic differ