You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/06/15 16:09:39 UTC
[tika] branch main updated: TIKA-3412 --
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new dbebeb5 TIKA-3412 --
dbebeb5 is described below
commit dbebeb5ebc6aa139bc3660850d1f9947fb711c46
Author: tallison <ta...@apache.org>
AuthorDate: Tue Jun 15 12:09:14 2021 -0400
TIKA-3412 --
---
CHANGES.txt | 2 +
tika-parent/pom.xml | 2 +-
.../tika-parser-audiovideo-module/pom.xml | 5 +
.../mp4/{MP4Parser.java => LegacyMP4Parser.java} | 2 +-
.../java/org/apache/tika/parser/mp4/MP4Parser.java | 351 ++++++++-------------
.../apache/tika/parser/mp4/TikaMp4BoxHandler.java | 63 ++++
.../tika/parser/mp4/boxes/TikaUserDataBox.java | 247 +++++++++++++++
...MP4ParserTest.java => LegacyMP4ParserTest.java} | 10 +-
.../org/apache/tika/parser/mp4/MP4ParserTest.java | 17 +-
.../tika-parser-image-module/pom.xml | 2 +-
10 files changed, 458 insertions(+), 243 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 84be502..098bad2 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -35,6 +35,8 @@ Release 2.0.0-ALPHA - 01/13/2021
what has been changed in the config object.
* We are now using non-shaded versions of xmpcore with namespaces com.adobe.internal.*
vs com.adobe.*.
+ * We switched the underlying MP4 parser from sannies' isoparser to the
+ MP4 parser in Drew Noakes' metadata-extractor.
* tika-parsers
* The parser modules have been broken into three main modules:
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 8e52aa3..5cf2d85 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -328,7 +328,7 @@
<log4j2.version>2.14.1</log4j2.version>
<lombok.version>1.18.20</lombok.version>
<lucene.version>8.8.2</lucene.version>
- <metadata.extractor.version>2.15.0.1</metadata.extractor.version>
+ <metadata.extractor.version>2.16.0</metadata.extractor.version>
<microsoft.translator.version>0.6.2</microsoft.translator.version>
<mime4j.version>0.8.4</mime4j.version>
<mockito.version>3.7.7</mockito.version>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/pom.xml
index c968347..64a7aab 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/pom.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/pom.xml
@@ -34,6 +34,11 @@
<artifactId>isoparser</artifactId>
<version>${isoparser.version}</version>
</dependency>
+ <dependency>
+ <groupId>com.drewnoakes</groupId>
+ <artifactId>metadata-extractor</artifactId>
+ <version>${metadata.extractor.version}</version>
+ </dependency>
</dependencies>
<build>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/LegacyMP4Parser.java
similarity index 99%
copy from tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java
copy to tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/LegacyMP4Parser.java
index e9e21c4..5a2936c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/LegacyMP4Parser.java
@@ -81,7 +81,7 @@ import org.apache.tika.sax.XHTMLContentHandler;
* This uses the MP4Parser project from http://code.google.com/p/mp4parser/
* to do the underlying parsing
*/
-public class MP4Parser extends AbstractParser {
+public class LegacyMP4Parser extends AbstractParser {
/**
* Serial version UID
*/
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java
index e9e21c4..00861e7 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java
@@ -16,12 +16,16 @@
*/
package org.apache.tika.parser.mp4;
+import java.io.File;
import java.io.IOException;
import java.io.InputStream;
+import java.nio.file.Files;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.util.Arrays;
+import java.util.Collection;
import java.util.Collections;
+import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
@@ -29,50 +33,30 @@ import java.util.Map;
import java.util.Optional;
import java.util.Set;
-import org.mp4parser.Box;
-import org.mp4parser.Container;
-import org.mp4parser.IsoFile;
-import org.mp4parser.boxes.apple.AppleAlbumBox;
-import org.mp4parser.boxes.apple.AppleArtist2Box;
-import org.mp4parser.boxes.apple.AppleArtistBox;
-import org.mp4parser.boxes.apple.AppleCommentBox;
-import org.mp4parser.boxes.apple.AppleCompilationBox;
-import org.mp4parser.boxes.apple.AppleDiskNumberBox;
-import org.mp4parser.boxes.apple.AppleEncoderBox;
-import org.mp4parser.boxes.apple.AppleGPSCoordinatesBox;
-import org.mp4parser.boxes.apple.AppleGenreBox;
-import org.mp4parser.boxes.apple.AppleItemListBox;
-import org.mp4parser.boxes.apple.AppleNameBox;
-import org.mp4parser.boxes.apple.AppleRecordingYear2Box;
-import org.mp4parser.boxes.apple.AppleTrackAuthorBox;
-import org.mp4parser.boxes.apple.AppleTrackNumberBox;
-import org.mp4parser.boxes.apple.Utf8AppleDataBox;
-import org.mp4parser.boxes.iso14496.part12.FileTypeBox;
-import org.mp4parser.boxes.iso14496.part12.MetaBox;
-import org.mp4parser.boxes.iso14496.part12.MovieBox;
-import org.mp4parser.boxes.iso14496.part12.MovieHeaderBox;
-import org.mp4parser.boxes.iso14496.part12.SampleDescriptionBox;
-import org.mp4parser.boxes.iso14496.part12.SampleTableBox;
-import org.mp4parser.boxes.iso14496.part12.TrackBox;
-import org.mp4parser.boxes.iso14496.part12.TrackHeaderBox;
-import org.mp4parser.boxes.iso14496.part12.UserDataBox;
-import org.mp4parser.boxes.sampleentry.AudioSampleEntry;
+import com.drew.imaging.mp4.Mp4Reader;
+import com.drew.metadata.Directory;
+import com.drew.metadata.MetadataException;
+import com.drew.metadata.Tag;
+import com.drew.metadata.mp4.Mp4BoxHandler;
+import com.drew.metadata.mp4.Mp4Directory;
+import com.drew.metadata.mp4.media.Mp4SoundDirectory;
+import com.drew.metadata.mp4.media.Mp4VideoDirectory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-import org.apache.tika.config.Field;
+import org.apache.tika.exception.RuntimeSAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.metadata.XMP;
import org.apache.tika.metadata.XMPDM;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.StringUtils;
/**
* Parser for the MP4 media container format, as well as the older
@@ -116,24 +100,6 @@ public class MP4Parser extends AbstractParser {
private ISO6709Extractor iso6709Extractor = new ISO6709Extractor();
- private static void addMetadata(Property prop, Metadata m, Utf8AppleDataBox metadata) {
- if (metadata != null) {
- m.set(prop, metadata.getValue());
- }
- }
-
- private static <T extends Box> T getOrNull(Container box, Class<T> clazz) {
- if (box == null) {
- return null;
- }
-
- List<T> boxes = box.getBoxes(clazz);
- if (boxes.size() == 0) {
- return null;
- }
- return boxes.get(0);
- }
-
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
@@ -141,228 +107,157 @@ public class MP4Parser extends AbstractParser {
public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
ParseContext context) throws IOException, SAXException, TikaException {
- // The MP4Parser library accepts either a File, or a byte array
- // As MP4 video files are typically large, always use a file to
- // avoid OOMs that may occur with in-memory buffering
TemporaryResources tmp = new TemporaryResources();
TikaInputStream tstream = TikaInputStream.get(stream, tmp);
- try (IsoFile isoFile = new IsoFile(tstream.getFile())) {
-
- // Grab the file type box
- FileTypeBox fileType = getOrNull(isoFile, FileTypeBox.class);
- if (fileType != null) {
- // Identify the type based on the major brand
- Optional<MediaType> typeHolder = typesMap.entrySet().stream()
- .filter(e -> e.getValue().contains(fileType.getMajorBrand())).findFirst()
- .map(Map.Entry::getKey);
-
- if (!typeHolder.isPresent()) {
- // If no match for major brand, see if any of the compatible brands match
- typeHolder = typesMap.entrySet().stream().filter(e -> e.getValue().stream()
- .anyMatch(fileType.getCompatibleBrands()::contains)).findFirst()
- .map(Map.Entry::getKey);
- }
-
- MediaType type = typeHolder.orElse(MediaType.application("mp4"));
- metadata.set(Metadata.CONTENT_TYPE, type.toString());
-
- if (type.getType().equals("audio")) {
- metadata.set(XMPDM.AUDIO_COMPRESSOR, fileType.getMajorBrand().trim());
- }
- } else {
- // Some older QuickTime files lack the FileType
- metadata.set(Metadata.CONTENT_TYPE, "video/quicktime");
- }
-
-
- // Get the main MOOV box
- MovieBox moov = getOrNull(isoFile, MovieBox.class);
- if (moov == null) {
- // Bail out
- return;
- }
-
+ try (InputStream is = Files.newInputStream(tstream.getPath())) {
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
-
- handleMovieHeaderBox(moov, metadata, xhtml);
- handleTrackBoxes(moov, metadata, xhtml);
-
- // Get metadata from the User Data Box
- UserDataBox userData = getOrNull(moov, UserDataBox.class);
- if (userData != null) {
- extractGPS(userData, metadata);
- MetaBox metaBox = getOrNull(userData, MetaBox.class);
-
- // Check for iTunes Metadata
- // See http://atomicparsley.sourceforge.net/mpeg-4files.html and
- // http://code.google.com/p/mp4v2/wiki/iTunesMetadata for more on these
- handleApple(metaBox, metadata, xhtml);
- // TODO Check for other kinds too
+ com.drew.metadata.Metadata mp4Metadata = new com.drew.metadata.Metadata();
+ Mp4BoxHandler boxHandler = new TikaMp4BoxHandler(mp4Metadata, metadata, xhtml);
+ try {
+ Mp4Reader.extract(is, boxHandler);
+ } catch (RuntimeSAXException e) {
+ throw (SAXException) e.getCause();
}
-
- // All done
+ //TODO -- figure out how to get IOExceptions out of boxhandler. Mp4Reader
+ //currently swallows IOExceptions.
+ processMp4Directories(mp4Metadata.getDirectoriesOfType(Mp4Directory.class), metadata);
xhtml.endDocument();
-
} finally {
tmp.dispose();
}
-
}
- private void handleTrackBoxes(MovieBox moov, Metadata metadata, XHTMLContentHandler xhtml) {
-
- // Get some more information from the track header
- // TODO Decide how to handle multiple tracks
- List<TrackBox> tb = moov.getBoxes(TrackBox.class);
- if (tb == null || tb.size() == 0) {
- return;
+ private void processMp4Directories(Collection<Mp4Directory> mp4Directories, Metadata metadata) {
+ for (Mp4Directory mp4Directory : mp4Directories) {
+ if (mp4Directory instanceof Mp4SoundDirectory) {
+ processMp4SoundDirectory((Mp4SoundDirectory) mp4Directory, metadata);
+ } else if (mp4Directory instanceof Mp4VideoDirectory) {
+ processMp4VideoDirectory((Mp4VideoDirectory) mp4Directory, metadata);
+ } else {
+ processActualMp4Directory(mp4Directory, metadata);
+ }
}
- TrackBox track = tb.get(0);
-
- TrackHeaderBox header = track.getTrackHeaderBox();
- // Get the creation and modification dates
- metadata.set(TikaCoreProperties.CREATED, header.getCreationTime());
- metadata.set(TikaCoreProperties.MODIFIED, header.getModificationTime());
+ }
- // Get the video with and height
- metadata.set(Metadata.IMAGE_WIDTH, (int) header.getWidth());
- metadata.set(Metadata.IMAGE_LENGTH, (int) header.getHeight());
+ private void processMp4VideoDirectory(Mp4VideoDirectory mp4Directory, Metadata metadata) {
+ //todo
+ }
- // Get the sample information
- SampleTableBox samples = track.getSampleTableBox();
- if (samples != null) {
- SampleDescriptionBox sampleDesc = samples.getSampleDescriptionBox();
- if (sampleDesc != null) {
- // Look for the first Audio Sample, if present
- AudioSampleEntry sample = getOrNull(sampleDesc, AudioSampleEntry.class);
- if (sample != null) {
- XMPDM.ChannelTypePropertyConverter
- .convertAndSet(metadata, sample.getChannelCount());
- //metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, sample.getSampleSize());
- // TODO Num -> Type mapping
- metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int) sample.getSampleRate());
- //metadata.set(XMPDM.AUDIO_, sample.getSamplesPerPacket());
- //metadata.set(XMPDM.AUDIO_, sample.getBytesPerSample());
- }
+ private void processMp4SoundDirectory(Mp4SoundDirectory mp4SoundDirectory,
+ Metadata metadata) {
+ addInt(mp4SoundDirectory, metadata, Mp4SoundDirectory.TAG_AUDIO_SAMPLE_RATE,
+ XMPDM.AUDIO_SAMPLE_RATE);
+
+ try {
+ int numChannels = mp4SoundDirectory.getInt(Mp4SoundDirectory.TAG_NUMBER_OF_CHANNELS);
+ if (numChannels == 1) {
+ metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "Mono");
+ } else if (numChannels == 2) {
+ metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "Stereo");
+ } else {
+ //??? log
}
+ } catch (MetadataException e) {
+ //log
}
}
- private void handleMovieHeaderBox(MovieBox moov, Metadata metadata, XHTMLContentHandler xhtml) {
- // Pull out some information from the header box
- MovieHeaderBox mHeader = getOrNull(moov, MovieHeaderBox.class);
- if (mHeader == null) {
- return;
+ private void addInt(Mp4Directory mp4Directory, Metadata metadata, int tag,
+ Property property) {
+ try {
+ int val = mp4Directory.getInt(tag);
+ metadata.set(property, val);
+ } catch (MetadataException e) {
+ //log
}
- // Get the creation and modification dates
- metadata.set(TikaCoreProperties.CREATED, mHeader.getCreationTime());
- metadata.set(TikaCoreProperties.MODIFIED, mHeader.getModificationTime());
-
- // Get the duration
- double durationSeconds = ((double) mHeader.getDuration()) / mHeader.getTimescale();
- metadata.set(XMPDM.DURATION, DURATION_FORMAT.format(durationSeconds));
-
- // The timescale is normally the sampling rate
- metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int) mHeader.getTimescale());
}
- private void handleApple(MetaBox metaBox, Metadata metadata, XHTMLContentHandler xhtml)
- throws SAXException {
- AppleItemListBox apple = getOrNull(metaBox, AppleItemListBox.class);
- if (apple == null) {
- return;
- }
- // Title
- AppleNameBox title = getOrNull(apple, AppleNameBox.class);
- addMetadata(TikaCoreProperties.TITLE, metadata, title);
+ private void processActualMp4Directory(Mp4Directory mp4Directory, Metadata metadata) {
+ addDate(mp4Directory, metadata, Mp4Directory.TAG_CREATION_TIME, TikaCoreProperties.CREATED);
+ addDate(mp4Directory, metadata, Mp4Directory.TAG_MODIFICATION_TIME,
+ TikaCoreProperties.MODIFIED);
+ handleBrands(mp4Directory, metadata);
+ handleDurationInSeconds(mp4Directory, metadata);
- // Artist
- AppleArtistBox artist = getOrNull(apple, AppleArtistBox.class);
- addMetadata(TikaCoreProperties.CREATOR, metadata, artist);
- addMetadata(XMPDM.ARTIST, metadata, artist);
+ addDouble(mp4Directory, metadata, Mp4Directory.TAG_LATITUDE, TikaCoreProperties.LATITUDE);
+ addDouble(mp4Directory, metadata, Mp4Directory.TAG_LONGITUDE, TikaCoreProperties.LONGITUDE);
- // Album Artist
- AppleArtist2Box artist2 = getOrNull(apple, AppleArtist2Box.class);
- addMetadata(XMPDM.ALBUM_ARTIST, metadata, artist2);
-
- // Album
- AppleAlbumBox album = getOrNull(apple, AppleAlbumBox.class);
- addMetadata(XMPDM.ALBUM, metadata, album);
-
- // Composer
- AppleTrackAuthorBox composer = getOrNull(apple, AppleTrackAuthorBox.class);
- addMetadata(XMPDM.COMPOSER, metadata, composer);
-
- // Genre
- AppleGenreBox genre = getOrNull(apple, AppleGenreBox.class);
- addMetadata(XMPDM.GENRE, metadata, genre);
+ }
- // Year
- AppleRecordingYear2Box year = getOrNull(apple, AppleRecordingYear2Box.class);
- if (year != null) {
- metadata.set(XMPDM.RELEASE_DATE, year.getValue());
+ private void handleDurationInSeconds(Mp4Directory mp4Directory, Metadata metadata) {
+ String durationInSeconds = mp4Directory.getString(Mp4Directory.TAG_DURATION_SECONDS);
+ if (durationInSeconds == null) {
+ return;
}
-
- // Track number
- AppleTrackNumberBox trackNum = getOrNull(apple, AppleTrackNumberBox.class);
- if (trackNum != null) {
- metadata.set(XMPDM.TRACK_NUMBER, trackNum.getA());
- //metadata.set(XMPDM.NUMBER_OF_TRACKS, trackNum.getB()); // TODO
+ if (! durationInSeconds.contains("/")) {
+ return;
}
-
- // Disc number
- AppleDiskNumberBox discNum = getOrNull(apple, AppleDiskNumberBox.class);
- if (discNum != null) {
- metadata.set(XMPDM.DISC_NUMBER, discNum.getA());
+ String[] bits = durationInSeconds.split("/");
+ if (bits.length != 2) {
+ return;
}
-
- // Compilation
- AppleCompilationBox compilation = getOrNull(apple, AppleCompilationBox.class);
- if (compilation != null) {
- metadata.set(XMPDM.COMPILATION, (int) compilation.getValue());
+ double durationSeconds;
+ try {
+ long numerator = Long.parseLong(bits[0]);
+ long denominator = Long.parseLong(bits[1]);
+ durationSeconds = (double)numerator/(double)denominator;
+ } catch (NumberFormatException e) {
+ //log
+ return;
}
+ // Get the duration
+ metadata.set(XMPDM.DURATION, DURATION_FORMAT.format(durationSeconds));
+ }
- // Comment
- AppleCommentBox comment = getOrNull(apple, AppleCommentBox.class);
- addMetadata(XMPDM.LOG_COMMENT, metadata, comment);
+ private void handleBrands(Mp4Directory mp4Directory, Metadata metadata) {
- // Encoder
- AppleEncoderBox encoder = getOrNull(apple, AppleEncoderBox.class);
- if (encoder != null) {
- metadata.set(XMP.CREATOR_TOOL, encoder.getValue());
- }
+ String majorBrand = mp4Directory.getString(Mp4Directory.TAG_MAJOR_BRAND);
+ // Identify the type based on the major brand
+ Optional<MediaType> typeHolder = typesMap.entrySet().stream()
+ .filter(e -> e.getValue().contains(majorBrand)).findFirst()
+ .map(Map.Entry::getKey);
- // As text
- for (Box box : apple.getBoxes()) {
- if (box instanceof Utf8AppleDataBox) {
- xhtml.element("p", ((Utf8AppleDataBox) box).getValue());
+ if (!typeHolder.isPresent()) {
+ String compatibleBrands =
+ mp4Directory.getString(Mp4Directory.TAG_COMPATIBLE_BRANDS);
+ if (compatibleBrands != null) {
+ // If no match for major brand, see if any of the compatible brands match
+ typeHolder = typesMap.entrySet().stream().filter(e ->
+ e.getValue().stream().anyMatch(compatibleBrands::contains))
+ .findFirst().map(Map.Entry::getKey);
}
}
+ MediaType type = typeHolder.orElse(MediaType.application("mp4"));
+ metadata.set(Metadata.CONTENT_TYPE, type.toString());
+ if (type.getType().equals("audio") && ! StringUtils.isBlank(majorBrand)) {
+ metadata.set(XMPDM.AUDIO_COMPRESSOR, majorBrand.trim());
+ }
}
- /**
- * Override the maximum record size limit. NOTE: this
- * sets a static variable on the IsoFile and affects all files
- * parsed in this JVM!!!
- *
- * @param maxRecordSize
- */
- @Field
- public void setMaxRecordSize(long maxRecordSize) {
- IsoFile.MAX_RECORD_SIZE_OVERRIDE = maxRecordSize;
+ private void addDate(Mp4Directory mp4Directory, Metadata metadata, int tag,
+ Property property) {
+ Date d = mp4Directory.getDate(tag);
+ if (d == null) {
+ return;
+ }
+ metadata.set(property, d);
+
}
- private void extractGPS(UserDataBox userData, Metadata metadata) {
- AppleGPSCoordinatesBox coordBox = getOrNull(userData, AppleGPSCoordinatesBox.class);
- if (coordBox == null) {
+ private void addDouble(Directory mp4Directory, Metadata metadata, int tag,
+ Property property) {
+ try {
+ double val = mp4Directory.getDouble(tag);
+ metadata.set(property, val);
+ } catch (MetadataException e) {
+ //log
return;
}
- String iso6709 = coordBox.getValue();
- iso6709Extractor.extract(iso6709, metadata);
+
}
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/TikaMp4BoxHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/TikaMp4BoxHandler.java
new file mode 100644
index 0000000..6cb6ccc
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/TikaMp4BoxHandler.java
@@ -0,0 +1,63 @@
+package org.apache.tika.parser.mp4;
+
+import java.io.IOException;
+
+import com.drew.imaging.mp4.Mp4Handler;
+import com.drew.lang.annotations.NotNull;
+import com.drew.lang.annotations.Nullable;
+import com.drew.metadata.Metadata;
+import com.drew.metadata.mp4.Mp4BoxHandler;
+import com.drew.metadata.mp4.Mp4Context;
+import com.drew.metadata.mp4.boxes.Box;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.parser.mp4.boxes.TikaUserDataBox;
+import org.apache.tika.sax.XHTMLContentHandler;
+
+public class TikaMp4BoxHandler extends Mp4BoxHandler {
+
+ org.apache.tika.metadata.Metadata tikaMetadata;
+ final XHTMLContentHandler xhtml;
+ public TikaMp4BoxHandler(Metadata metadata, org.apache.tika.metadata.Metadata tikaMetadata,
+ XHTMLContentHandler xhtml) {
+ super(metadata);
+ this.tikaMetadata = tikaMetadata;
+ this.xhtml = xhtml;
+ }
+
+ @Override
+ public boolean shouldAcceptBox(@NotNull Box box) {
+ if (box.type.equals("udta")) {
+ return true;
+ }
+ return super.shouldAcceptBox(box);
+ }
+
+ @Override
+ public boolean shouldAcceptContainer(@NotNull Box box) {
+ return super.shouldAcceptContainer(box);
+ }
+
+ @Override
+ public Mp4Handler<?> processBox(@NotNull Box box, @Nullable byte[] payload, Mp4Context context)
+ throws IOException {
+ if (box.type.equals("udta")) {
+ return processUserData(box, payload, context);
+ }
+
+ return super.processBox(box, payload, context);
+ }
+
+
+ private Mp4Handler<?> processUserData(Box box, byte[] payload, Mp4Context context) throws IOException {
+ if (payload == null) {
+ return this;
+ }
+ try {
+ new TikaUserDataBox(box, payload, tikaMetadata, xhtml).addMetadata(directory);
+ } catch (SAXException e) {
+ throw new IOException(e);
+ }
+ return this;
+ }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/boxes/TikaUserDataBox.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/boxes/TikaUserDataBox.java
new file mode 100644
index 0000000..c76b3e0
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/boxes/TikaUserDataBox.java
@@ -0,0 +1,247 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp4.boxes;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import com.drew.lang.SequentialByteArrayReader;
+import com.drew.lang.SequentialReader;
+import com.drew.lang.annotations.NotNull;
+import com.drew.lang.annotations.Nullable;
+import com.drew.metadata.mp4.Mp4Directory;
+import com.drew.metadata.mp4.boxes.Box;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.exception.RuntimeSAXException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.XMP;
+import org.apache.tika.metadata.XMPDM;
+import org.apache.tika.sax.XHTMLContentHandler;
+
+public class TikaUserDataBox extends Box {
+
+ private static final String LOCATION_CODE = "\u00A9xyz";
+ private static final Pattern COORDINATE_PATTERN =
+ Pattern.compile("([+-]\\d+\\.\\d+)([+-]\\d+\\.\\d+)");
+
+ @Nullable
+ private String coordinateString;
+
+ private boolean isQuickTime = false;
+ private final Metadata metadata;
+ private final XHTMLContentHandler xhtml;
+ public TikaUserDataBox(@NotNull Box box, byte[] payload, Metadata metadata,
+ XHTMLContentHandler xhtml) throws IOException, SAXException {
+ super(box);
+ this.metadata = metadata;
+ this.xhtml = xhtml;
+ int length = payload.length;
+ SequentialReader reader = new SequentialByteArrayReader(payload);
+ while (reader.getPosition() < (long) length) {
+ long size = reader.getUInt32();
+ if (size <= 4L) {
+ break;
+ }
+ String kindName = reader.getString(4, StandardCharsets.ISO_8859_1);
+ if (LOCATION_CODE.equals(kindName)) {
+ int xyzLength = reader.getUInt16();
+ reader.skip(2L);
+ this.coordinateString = reader.getString(xyzLength, "UTF-8");
+ } else if ("meta".equals(kindName)) {
+ reader.getUInt32();
+ reader.getUInt32();
+ String hdlr = reader.getString(4, StandardCharsets.ISO_8859_1);
+ reader.getUInt32();
+ reader.getUInt32();
+ String subtype = reader.getString(4, StandardCharsets.ISO_8859_1);
+ // If the second and the fifth 32-bit integers encode 'hdlr' and 'mdta' respectively
+ // then the MetaBox is formatted according to QuickTime File Format.
+ // See https://developer.apple.com/library/content/documentation
+ // /QuickTime/QTFF/Metadata/Metadata.html
+ if (hdlr.equals("hdlr") && subtype.equals("mdta")) {
+ isQuickTime = true;
+ }
+ parseUserDataBox(reader, subtype);
+ } else {
+ if (size < 8L) {
+ return;
+ }
+
+ reader.skip(size - 8L);
+ }
+ }
+
+ }
+
+ private void parseUserDataBox(SequentialReader reader, String handlerType)
+ throws IOException, SAXException {
+ if (! "mdir".equals(handlerType)) {
+ return;
+ }
+ String mdirType = reader.getString(4, StandardCharsets.ISO_8859_1);
+
+ if ("appl".equals(mdirType)) {
+ reader.getString(10);//not sure what these bytes are
+ long len = reader.getUInt32();
+ if (len >= Integer.MAX_VALUE || len <= 0) {
+ //log
+ return;
+ }
+ String subType = reader.getString(4, StandardCharsets.ISO_8859_1);
+ if ("ilst".equals(subType)) {
+ processIList(reader, len);
+ }
+ }
+ }
+
+ private void processIList(SequentialReader reader, long totalLen)
+ throws IOException {
+
+ long totalRead = 0;
+ while (totalRead < totalLen) {
+ long recordLen = reader.getUInt32();
+ String fieldName = reader.getString(4, StandardCharsets.ISO_8859_1);
+ long fieldLen = reader.getUInt32();
+ String typeName = reader.getString(4, StandardCharsets.ISO_8859_1);//data
+ totalRead += 16;
+ if ("data".equals(typeName)) {
+ reader.skip(8);//not sure what these are
+ totalRead += 8;
+ int toRead = (int) fieldLen - 16;
+ if (toRead <= 0) {
+ //log?
+ return;
+ }
+ if ("covr".equals(fieldName)) {
+ //covr can be an image file, e.g. png or jpeg
+ //skip this for now
+ reader.skip(toRead);
+ } else if ("cpil".equals(fieldName)) {
+ int compilationId = (int)reader.getByte();
+ metadata.set(XMPDM.COMPILATION, compilationId);
+ } else if ("trkn".equals(fieldName)) {
+ if (toRead == 8) {
+ long numA = reader.getUInt32();
+ long numB = reader.getUInt32();
+ metadata.set(XMPDM.TRACK_NUMBER, (int)numA);
+ } else {
+ //log
+ reader.skip(toRead);
+ }
+ } else if ("disk".equals(fieldName)) {
+ int a = reader.getInt32();
+ short b = reader.getInt16();
+ metadata.set(XMPDM.DISC_NUMBER, a);
+ } else {
+ String val = reader.getString(toRead, StandardCharsets.UTF_8);
+ try {
+ addMetadata(fieldName, val);
+ } catch (SAXException e) {
+ //need to punch through IOException catching in MP4Reader
+ throw new RuntimeSAXException(e);
+ }
+ }
+
+ totalRead += toRead;
+ } else {
+ int toSkip = (int) recordLen - 16;
+ if (toSkip <= 0) {
+ //log?
+ return;
+ }
+ reader.skip(toSkip);
+ totalRead += toSkip;
+ }
+ }
+ }
+
+
+ private void addMetadata(String key, String value) throws SAXException {
+ switch (key) {
+ case "\u00A9nam":
+ metadata.set(TikaCoreProperties.TITLE, value);
+ xhtml.element("p", value);
+ break;
+ case "\u00A9too":
+ metadata.set(XMP.CREATOR_TOOL, value);
+ break;
+ case "\u00A9ART" :
+ metadata.set(XMPDM.ARTIST, value);
+ metadata.set(TikaCoreProperties.CREATOR, value);
+ xhtml.element("p", value);
+ break;
+ case "aART" :
+ metadata.set(XMPDM.ALBUM_ARTIST, value);
+ xhtml.element("p", value);
+ break;
+ case "\u00A9wrt":
+ metadata.set(XMPDM.COMPOSER, value);
+ xhtml.element("p", value);
+ break;
+ case "\u00A9alb":
+ metadata.set(XMPDM.ALBUM, value);
+ xhtml.element("p", value);
+ break;
+ case "\u00A9gen" :
+ metadata.set(XMPDM.GENRE, value);
+ xhtml.element("p", value);
+ break;
+ case "\u00A9day" :
+ //this can be a year "2008" or a date "2017-04-26T07:00:00Z"
+ metadata.set(XMPDM.RELEASE_DATE, value);
+ xhtml.element("p", value);
+ break;
+ case "\u00A9cmt" :
+ metadata.set(XMPDM.LOG_COMMENT, value);
+ xhtml.element("p", value);
+ break;
+ case "cprt" :
+ metadata.set(XMPDM.COPYRIGHT, value);
+ xhtml.element("p", value);
+ break;
+ case "xid " :
+ //not sure this is the right use of this key
+ metadata.set(XMP.IDENTIFIER, value);
+ break;
+ //purd date?
+ //xid ? e.g. SonyBMG:isrc:KRA031208874
+ //cprt copyright
+ //ownr ? and apID
+ //flvr ?
+ //son = nam, soal = (c)alb soar = aART?
+ //(C)ART
+ }
+ }
+
+ public void addMetadata(Mp4Directory directory) {
+ if (this.coordinateString != null) {
+ Matcher matcher = COORDINATE_PATTERN.matcher(this.coordinateString);
+ if (matcher.find()) {
+ double latitude = Double.parseDouble(matcher.group(1));
+ double longitude = Double.parseDouble(matcher.group(2));
+ directory.setDouble(8193, latitude);
+ directory.setDouble(8194, longitude);
+ }
+ }
+ }
+}
+
+
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp4/MP4ParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp4/LegacyMP4ParserTest.java
similarity index 93%
copy from tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp4/MP4ParserTest.java
copy to tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp4/LegacyMP4ParserTest.java
index ee9c9e4..f00394c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp4/MP4ParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp4/LegacyMP4ParserTest.java
@@ -18,6 +18,8 @@ package org.apache.tika.parser.mp4;
import static org.junit.Assert.assertEquals;
+import java.util.Arrays;
+
import org.junit.Test;
import org.xml.sax.ContentHandler;
@@ -34,7 +36,7 @@ import org.apache.tika.sax.BodyContentHandler;
/**
* Test case for parsing mp4 files.
*/
-public class MP4ParserTest extends TikaTest {
+public class LegacyMP4ParserTest extends TikaTest {
/**
* Test that we can extract information from
* a M4A MP4 Audio file
@@ -42,7 +44,8 @@ public class MP4ParserTest extends TikaTest {
@Test
public void testMP4ParsingAudio() throws Exception {
Metadata metadata = new Metadata();
- String content = getText("testMP4.m4a", metadata);
+
+ String content = getText("testMP4.m4a", new LegacyMP4Parser(), metadata);
// Check core properties
assertEquals("audio/mp4", metadata.get(Metadata.CONTENT_TYPE));
@@ -79,6 +82,9 @@ public class MP4ParserTest extends TikaTest {
assertEquals("iTunes 10.5.3.3", metadata.get(XMP.CREATOR_TOOL));
+ assertContains("org.apache.tika.parser.mp4.LegacyMP4Parser",
+ Arrays.asList(metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY)));
+
// Check again by file, rather than stream
TikaInputStream tstream =
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp4/MP4ParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp4/MP4ParserTest.java
index ee9c9e4..14dd97e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp4/MP4ParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/test/java/org/apache/tika/parser/mp4/MP4ParserTest.java
@@ -18,11 +18,12 @@ package org.apache.tika.parser.mp4;
import static org.junit.Assert.assertEquals;
+import java.util.Arrays;
+
import org.junit.Test;
import org.xml.sax.ContentHandler;
import org.apache.tika.TikaTest;
-import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -79,6 +80,8 @@ public class MP4ParserTest extends TikaTest {
assertEquals("iTunes 10.5.3.3", metadata.get(XMP.CREATOR_TOOL));
+ assertContains("org.apache.tika.parser.mp4.MP4Parser",
+ Arrays.asList(metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY)));
// Check again by file, rather than stream
TikaInputStream tstream =
@@ -97,14 +100,8 @@ public class MP4ParserTest extends TikaTest {
// TODO Test an old QuickTime Video File
@Test(timeout = 30000)
public void testInfiniteLoop() throws Exception {
- //test that a truncated mp4 doesn't cause an infinite loop
- //TIKA-1931 and TIKA-1924
- try {
- XMLResult r = getXML("testMP4_truncated.m4a");
- assertEquals("audio/mp4", r.metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("M4A", r.metadata.get(XMPDM.AUDIO_COMPRESSOR));
- } catch (TikaException e) {
- //java 11
- }
+ XMLResult r = getXML("testMP4_truncated.m4a");
+ assertEquals("audio/mp4", r.metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("M4A", r.metadata.get(XMPDM.AUDIO_COMPRESSOR));
}
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/pom.xml
index 5bc6310..9e99b12 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/pom.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/pom.xml
@@ -32,7 +32,7 @@
<dependency>
<groupId>com.drewnoakes</groupId>
<artifactId>metadata-extractor</artifactId>
- <version>2.16.0</version>
+ <version>${metadata.extractor.version}</version>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>