You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2014/07/24 21:19:27 UTC
svn commit: r1613249 - in /tika/trunk/tika-parsers: pom.xml
src/main/java/org/apache/tika/parser/mp4/DirectFileReadDataSource.java
src/main/java/org/apache/tika/parser/mp4/MP4Parser.java
Author: nick
Date: Thu Jul 24 19:19:26 2014
New Revision: 1613249
URL: http://svn.apache.org/r1613249
Log:
Patch from Matthias Krueger from TIKA-1361 - Upgrade MP4Parser to 1.0.2, add a custom Data Source and use that for explicit temp handling. This closes #14 from Github
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp4/DirectFileReadDataSource.java
Modified:
tika/trunk/tika-parsers/pom.xml
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java
Modified: tika/trunk/tika-parsers/pom.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1613249&r1=1613248&r2=1613249&view=diff
==============================================================================
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Thu Jul 24 19:19:26 2014
@@ -164,7 +164,7 @@
<dependency>
<groupId>com.googlecode.mp4parser</groupId>
<artifactId>isoparser</artifactId>
- <version>1.0-RC-1</version>
+ <version>1.0.2</version>
</dependency>
<dependency>
<groupId>com.drewnoakes</groupId>
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp4/DirectFileReadDataSource.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp4/DirectFileReadDataSource.java?rev=1613249&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp4/DirectFileReadDataSource.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp4/DirectFileReadDataSource.java Thu Jul 24 19:19:26 2014
@@ -0,0 +1,153 @@
+package org.apache.tika.parser.mp4;
+
+import com.googlecode.mp4parser.DataSource;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.nio.ByteBuffer;
+import java.nio.channels.WritableByteChannel;
+
+import static com.googlecode.mp4parser.util.CastUtils.l2i;
+
+/**
+ * A {@link DataSource} implementation that relies on direct reads from a {@link RandomAccessFile}.
+ * It should be slower than {@link com.googlecode.mp4parser.FileDataSourceImpl} but does not incur the implicit file locks of
+ * memory mapped I/O on some JVMs. This implementation allows for a more controlled deletion of files
+ * and might be preferred when working with temporary files.
+ * <p>
+ * Every read, including {@link #map(long, long)} and {@link #transferTo(long, long, WritableByteChannel)},
+ * goes through the same {@link RandomAccessFile} and therefore moves the file pointer reported by {@link #position()}.
+ * @see <a href="http://bugs.java.com/view_bug.do?bug_id=4724038">JDK-4724038 : (fs) Add unmap method to MappedByteBuffer</a>
+ * @see <a href="http://bugs.java.com/view_bug.do?bug_id=6359560">JDK-6359560 : (fs) File.deleteOnExit() doesn't work when MappedByteBuffer exists (win)</a>
+ */
+public class DirectFileReadDataSource implements DataSource {
+
+    /** Upper bound on the number of bytes fetched per file read in {@link #read(ByteBuffer)}. */
+    private static final int TRANSFER_SIZE = 8192;
+
+    /** The file being read; opened read-only by the constructor and released by {@link #close()}. */
+    private final RandomAccessFile raf;
+
+    /**
+     * Opens the given file for read-only random access.
+     *
+     * @param f the file to read from
+     * @throws IOException if the file cannot be opened for reading
+     */
+    public DirectFileReadDataSource(File f) throws IOException {
+        this.raf = new RandomAccessFile(f, "r");
+    }
+
+    /**
+     * Fills the buffer from the current file position, fetching at most {@link #TRANSFER_SIZE}
+     * bytes per file read, until the buffer is full or the end of the file is reached.
+     *
+     * @param byteBuffer the buffer to fill
+     * @return the number of bytes read, or -1 if the end of the file was reached before any byte was read
+     * @throws IOException if reading from the file fails
+     */
+    public int read(ByteBuffer byteBuffer) throws IOException {
+        int len = byteBuffer.remaining();
+        int totalRead = 0;
+        int bytesRead = 0;
+        byte[] buf = new byte[TRANSFER_SIZE];
+        while (totalRead < len) {
+            int bytesToRead = Math.min((len - totalRead), TRANSFER_SIZE);
+            bytesRead = raf.read(buf, 0, bytesToRead);
+            if (bytesRead < 0) {
+                // End of file: keep whatever has been copied so far.
+                break;
+            }
+            totalRead += bytesRead;
+            byteBuffer.put(buf, 0, bytesRead);
+        }
+        return ((bytesRead < 0) && (totalRead == 0)) ? -1 : totalRead;
+    }
+
+    /**
+     * Reads into the buffer with a single file read sized to the buffer's remaining capacity.
+     * Fewer bytes than requested may be read.
+     *
+     * @param byteBuffer the buffer to read into
+     * @return the number of bytes read, or -1 if the end of the file has been reached
+     * @throws IOException if reading from the file fails
+     */
+    public int readAllInOnce(ByteBuffer byteBuffer) throws IOException {
+        byte[] buf = new byte[byteBuffer.remaining()];
+        int read = raf.read(buf);
+        // raf.read returns -1 at end of file; passing that to put() would throw
+        // an IndexOutOfBoundsException instead of reporting EOF to the caller.
+        if (read > 0) {
+            byteBuffer.put(buf, 0, read);
+        }
+        return read;
+    }
+
+    /**
+     * @return the length of the underlying file in bytes
+     * @throws IOException if the length cannot be determined
+     */
+    public long size() throws IOException {
+        return raf.length();
+    }
+
+    /**
+     * @return the current file pointer, i.e. the offset at which the next read starts
+     * @throws IOException if the position cannot be determined
+     */
+    public long position() throws IOException {
+        return raf.getFilePointer();
+    }
+
+    /**
+     * Moves the file pointer.
+     *
+     * @param nuPos the new offset, measured from the beginning of the file
+     * @throws IOException if seeking fails
+     */
+    public void position(long nuPos) throws IOException {
+        raf.seek(nuPos);
+    }
+
+    /**
+     * Copies a region of the file to the target channel. The region is first read fully into
+     * memory via {@link #map(long, long)} and then handed to the channel in a single write.
+     *
+     * @param position the offset in the file at which the region starts
+     * @param count the number of bytes in the region
+     * @param target the channel to write to
+     * @return the number of bytes the channel reported as written
+     * @throws IOException if reading the file or writing to the channel fails
+     */
+    public long transferTo(long position, long count, WritableByteChannel target) throws IOException {
+        return target.write(map(position, count));
+    }
+
+    /**
+     * Reads a region of the file into a newly allocated heap buffer; no memory mapping is involved.
+     * The file pointer is left just past the end of the region.
+     *
+     * @param startPosition the offset in the file at which the region starts
+     * @param size the number of bytes to read
+     * @return a buffer wrapping the bytes of the region
+     * @throws IOException if the region extends past the end of the file or reading fails
+     */
+    public ByteBuffer map(long startPosition, long size) throws IOException {
+        raf.seek(startPosition);
+        byte[] payload = new byte[l2i(size)];
+        raf.readFully(payload);
+        return ByteBuffer.wrap(payload);
+    }
+
+    /**
+     * Closes the underlying file.
+     *
+     * @throws IOException if closing fails
+     */
+    public void close() throws IOException {
+        raf.close();
+    }
+
+
+}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java?rev=1613249&r1=1613248&r2=1613249&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java Thu Jul 24 19:19:26 2014
@@ -16,17 +16,13 @@
*/
package org.apache.tika.parser.mp4;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.Date;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
+import com.coremedia.iso.IsoFile;
+import com.coremedia.iso.boxes.*;
+import com.coremedia.iso.boxes.apple.AppleItemListBox;
+import com.coremedia.iso.boxes.sampleentry.AudioSampleEntry;
+import com.googlecode.mp4parser.boxes.apple.*;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
@@ -39,31 +35,9 @@ import org.apache.tika.sax.XHTMLContentH
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-import com.coremedia.iso.IsoFile;
-import com.coremedia.iso.boxes.Box;
-import com.coremedia.iso.boxes.ContainerBox;
-import com.coremedia.iso.boxes.FileTypeBox;
-import com.coremedia.iso.boxes.MetaBox;
-import com.coremedia.iso.boxes.MovieBox;
-import com.coremedia.iso.boxes.MovieHeaderBox;
-import com.coremedia.iso.boxes.SampleDescriptionBox;
-import com.coremedia.iso.boxes.SampleTableBox;
-import com.coremedia.iso.boxes.TrackBox;
-import com.coremedia.iso.boxes.TrackHeaderBox;
-import com.coremedia.iso.boxes.UserDataBox;
-import com.coremedia.iso.boxes.apple.AbstractAppleMetaDataBox;
-import com.coremedia.iso.boxes.apple.AppleAlbumBox;
-import com.coremedia.iso.boxes.apple.AppleArtistBox;
-import com.coremedia.iso.boxes.apple.AppleCommentBox;
-import com.coremedia.iso.boxes.apple.AppleCustomGenreBox;
-import com.coremedia.iso.boxes.apple.AppleEncoderBox;
-import com.coremedia.iso.boxes.apple.AppleItemListBox;
-import com.coremedia.iso.boxes.apple.AppleRecordingYearBox;
-import com.coremedia.iso.boxes.apple.AppleStandardGenreBox;
-import com.coremedia.iso.boxes.apple.AppleTrackAuthorBox;
-import com.coremedia.iso.boxes.apple.AppleTrackNumberBox;
-import com.coremedia.iso.boxes.apple.AppleTrackTitleBox;
-import com.coremedia.iso.boxes.sampleentry.AudioSampleEntry;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.*;
/**
* Parser for the MP4 media container format, as well as the older
@@ -112,197 +86,178 @@ public class MP4Parser extends AbstractP
// The MP4Parser library accepts either a File, or a byte array
// As MP4 video files are typically large, always use a file to
// avoid OOMs that may occur with in-memory buffering
- TikaInputStream tstream = TikaInputStream.get(stream);
+ TemporaryResources tmp = new TemporaryResources();
+ TikaInputStream tstream = TikaInputStream.get(stream, tmp);
try {
- isoFile = new IsoFile(tstream.getFileChannel());
- } finally {
- tstream.close();
- }
-
-
- // Grab the file type box
- FileTypeBox fileType = getOrNull(isoFile, FileTypeBox.class);
- if (fileType != null) {
- // Identify the type
- MediaType type = MediaType.application("mp4");
- for (MediaType t : typesMap.keySet()) {
- if (typesMap.get(t).contains(fileType.getMajorBrand())) {
- type = t;
- break;
- }
- }
- metadata.set(Metadata.CONTENT_TYPE, type.toString());
-
- if (type.getType().equals("audio")) {
- metadata.set(XMPDM.AUDIO_COMPRESSOR, fileType.getMajorBrand().trim());
- }
- } else {
- // Some older QuickTime files lack the FileType
- metadata.set(Metadata.CONTENT_TYPE, "video/quicktime");
- }
-
-
- // Get the main MOOV box
- MovieBox moov = getOrNull(isoFile, MovieBox.class);
- if (moov == null) {
- // Bail out
- return;
- }
+ isoFile = new IsoFile(new DirectFileReadDataSource(tstream.getFile()));
+ tmp.addResource(isoFile);
-
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
-
-
- // Pull out some information from the header box
- MovieHeaderBox mHeader = getOrNull(moov, MovieHeaderBox.class);
- if (mHeader != null) {
- // Get the creation and modification dates
- metadata.set(
- Metadata.CREATION_DATE,
- MP4TimeToDate(mHeader.getCreationTime())
- );
- metadata.set(
- TikaCoreProperties.MODIFIED,
- MP4TimeToDate(mHeader.getModificationTime())
- );
-
- // Get the duration
- double durationSeconds = ((double)mHeader.getDuration()) / mHeader.getTimescale();
- // TODO Use this
-
- // The timescale is normally the sampling rate
- metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int)mHeader.getTimescale());
- }
-
-
- // Get some more information from the track header
- // TODO Decide how to handle multiple tracks
- List<TrackBox> tb = moov.getBoxes(TrackBox.class);
- if (tb.size() > 0) {
- TrackBox track = tb.get(0);
-
- TrackHeaderBox header = track.getTrackHeaderBox();
- // Get the creation and modification dates
- metadata.set(
- TikaCoreProperties.CREATED,
- MP4TimeToDate(header.getCreationTime())
- );
- metadata.set(
- TikaCoreProperties.MODIFIED,
- MP4TimeToDate(header.getModificationTime())
- );
-
- // Get the video with and height
- metadata.set(Metadata.IMAGE_WIDTH, (int)header.getWidth());
- metadata.set(Metadata.IMAGE_LENGTH, (int)header.getHeight());
-
- // Get the sample information
- SampleTableBox samples = track.getSampleTableBox();
- SampleDescriptionBox sampleDesc = samples.getSampleDescriptionBox();
- if (sampleDesc != null) {
- // Look for the first Audio Sample, if present
- AudioSampleEntry sample = getOrNull(sampleDesc, AudioSampleEntry.class);
- if (sample != null) {
- XMPDM.ChannelTypePropertyConverter.convertAndSet(metadata, sample.getChannelCount());
- //metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, sample.getSampleSize()); // TODO Num -> Type mapping
- metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int)sample.getSampleRate());
- //metadata.set(XMPDM.AUDIO_, sample.getSamplesPerPacket());
- //metadata.set(XMPDM.AUDIO_, sample.getBytesPerSample());
- }
- }
- }
-
- // Get metadata from the User Data Box
- UserDataBox userData = getOrNull(moov, UserDataBox.class);
- if (userData != null) {
- MetaBox meta = getOrNull(userData, MetaBox.class);
-
- // Check for iTunes Metadata
- // See http://atomicparsley.sourceforge.net/mpeg-4files.html and
- // http://code.google.com/p/mp4v2/wiki/iTunesMetadata for more on these
- AppleItemListBox apple = getOrNull(meta, AppleItemListBox.class);
- if (apple != null) {
- // Title
- AppleTrackTitleBox title = getOrNull(apple, AppleTrackTitleBox.class);
- addMetadata(TikaCoreProperties.TITLE, metadata, title);
-
- // Artist
- AppleArtistBox artist = getOrNull(apple, AppleArtistBox.class);
- addMetadata(TikaCoreProperties.CREATOR, metadata, artist);
- addMetadata(XMPDM.ARTIST, metadata, artist);
-
- // Album
- AppleAlbumBox album = getOrNull(apple, AppleAlbumBox.class);
- addMetadata(XMPDM.ALBUM, metadata, album);
-
- // Composer
- AppleTrackAuthorBox composer = getOrNull(apple, AppleTrackAuthorBox.class);
- addMetadata(XMPDM.COMPOSER, metadata, composer);
-
- // Genre
- AppleStandardGenreBox sGenre = getOrNull(apple, AppleStandardGenreBox.class);
- AppleCustomGenreBox cGenre = getOrNull(apple, AppleCustomGenreBox.class);
- addMetadata(XMPDM.GENRE, metadata, sGenre);
- addMetadata(XMPDM.GENRE, metadata, cGenre);
-
- // Year
- AppleRecordingYearBox year = getOrNull(apple, AppleRecordingYearBox.class);
- addMetadata(XMPDM.RELEASE_DATE, metadata, year);
-
- // Track number
- AppleTrackNumberBox trackNum = getOrNull(apple, AppleTrackNumberBox.class);
- if (trackNum != null) {
- metadata.set(XMPDM.TRACK_NUMBER, trackNum.getTrackNumber());
- //metadata.set(XMPDM.NUMBER_OF_TRACKS, trackNum.getNumberOfTracks()); // TODO
- }
-
- // Comment
- AppleCommentBox comment = getOrNull(apple, AppleCommentBox.class);
- addMetadata(XMPDM.LOG_COMMENT, metadata, comment);
-
- // Encoder
- AppleEncoderBox encoder = getOrNull(apple, AppleEncoderBox.class);
- // addMetadata(XMPDM.???, metadata, encoder); // TODO
-
-
- // As text
- for (Box box : apple.getBoxes()) {
- if (box instanceof AbstractAppleMetaDataBox) {
- xhtml.element("p", ((AbstractAppleMetaDataBox)box).getValue());
- }
- }
- }
-
- // TODO Check for other kinds too
+ // Grab the file type box
+ FileTypeBox fileType = getOrNull(isoFile, FileTypeBox.class);
+ if (fileType != null) {
+ // Identify the type
+ MediaType type = MediaType.application("mp4");
+ for (MediaType t : typesMap.keySet()) {
+ if (typesMap.get(t).contains(fileType.getMajorBrand())) {
+ type = t;
+ break;
+ }
+ }
+ metadata.set(Metadata.CONTENT_TYPE, type.toString());
+
+ if (type.getType().equals("audio")) {
+ metadata.set(XMPDM.AUDIO_COMPRESSOR, fileType.getMajorBrand().trim());
+ }
+ } else {
+ // Some older QuickTime files lack the FileType
+ metadata.set(Metadata.CONTENT_TYPE, "video/quicktime");
+ }
+
+
+ // Get the main MOOV box
+ MovieBox moov = getOrNull(isoFile, MovieBox.class);
+ if (moov == null) {
+ // Bail out
+ return;
+ }
+
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+
+
+ // Pull out some information from the header box
+ MovieHeaderBox mHeader = getOrNull(moov, MovieHeaderBox.class);
+ if (mHeader != null) {
+ // Get the creation and modification dates
+ metadata.set(Metadata.CREATION_DATE, mHeader.getCreationTime());
+ metadata.set(TikaCoreProperties.MODIFIED, mHeader.getModificationTime());
+
+ // Get the duration
+ double durationSeconds = ((double)mHeader.getDuration()) / mHeader.getTimescale();
+ // TODO Use this
+
+ // The timescale is normally the sampling rate
+ metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int)mHeader.getTimescale());
+ }
+
+
+ // Get some more information from the track header
+ // TODO Decide how to handle multiple tracks
+ List<TrackBox> tb = moov.getBoxes(TrackBox.class);
+ if (tb.size() > 0) {
+ TrackBox track = tb.get(0);
+
+ TrackHeaderBox header = track.getTrackHeaderBox();
+ // Get the creation and modification dates
+ metadata.set(TikaCoreProperties.CREATED, header.getCreationTime());
+ metadata.set(TikaCoreProperties.MODIFIED, header.getModificationTime());
+
+ // Get the video width and height
+ metadata.set(Metadata.IMAGE_WIDTH, (int)header.getWidth());
+ metadata.set(Metadata.IMAGE_LENGTH, (int)header.getHeight());
+
+ // Get the sample information
+ SampleTableBox samples = track.getSampleTableBox();
+ SampleDescriptionBox sampleDesc = samples.getSampleDescriptionBox();
+ if (sampleDesc != null) {
+ // Look for the first Audio Sample, if present
+ AudioSampleEntry sample = getOrNull(sampleDesc, AudioSampleEntry.class);
+ if (sample != null) {
+ XMPDM.ChannelTypePropertyConverter.convertAndSet(metadata, sample.getChannelCount());
+ //metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, sample.getSampleSize()); // TODO Num -> Type mapping
+ metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int)sample.getSampleRate());
+ //metadata.set(XMPDM.AUDIO_, sample.getSamplesPerPacket());
+ //metadata.set(XMPDM.AUDIO_, sample.getBytesPerSample());
+ }
+ }
+ }
+
+ // Get metadata from the User Data Box
+ UserDataBox userData = getOrNull(moov, UserDataBox.class);
+ if (userData != null) {
+ MetaBox meta = getOrNull(userData, MetaBox.class);
+
+ // Check for iTunes Metadata
+ // See http://atomicparsley.sourceforge.net/mpeg-4files.html and
+ // http://code.google.com/p/mp4v2/wiki/iTunesMetadata for more on these
+ AppleItemListBox apple = getOrNull(meta, AppleItemListBox.class);
+ if (apple != null) {
+ // Title
+ AppleNameBox title = getOrNull(apple, AppleNameBox.class);
+ addMetadata(TikaCoreProperties.TITLE, metadata, title);
+
+ // Artist
+ AppleArtistBox artist = getOrNull(apple, AppleArtistBox.class);
+ addMetadata(TikaCoreProperties.CREATOR, metadata, artist);
+ addMetadata(XMPDM.ARTIST, metadata, artist);
+
+ // Album
+ AppleAlbumBox album = getOrNull(apple, AppleAlbumBox.class);
+ addMetadata(XMPDM.ALBUM, metadata, album);
+
+ // Composer
+ AppleTrackAuthorBox composer = getOrNull(apple, AppleTrackAuthorBox.class);
+ addMetadata(XMPDM.COMPOSER, metadata, composer);
+
+ // Genre
+ AppleGenreBox genre = getOrNull(apple, AppleGenreBox.class);
+ addMetadata(XMPDM.GENRE, metadata, genre);
+
+ // Year
+ AppleRecordingYear2Box year = getOrNull(apple, AppleRecordingYear2Box.class);
+ if (year != null) {
+ metadata.set(XMPDM.RELEASE_DATE, year.getValue());
+ }
+
+ // Track number
+ AppleTrackNumberBox trackNum = getOrNull(apple, AppleTrackNumberBox.class);
+ if (trackNum != null) {
+ metadata.set(XMPDM.TRACK_NUMBER, trackNum.getA());
+ //metadata.set(XMPDM.NUMBER_OF_TRACKS, trackNum.getB()); // TODO
+ }
+
+ // Comment
+ AppleCommentBox comment = getOrNull(apple, AppleCommentBox.class);
+ addMetadata(XMPDM.LOG_COMMENT, metadata, comment);
+
+ // Encoder
+ AppleEncoderBox encoder = getOrNull(apple, AppleEncoderBox.class);
+ // addMetadata(XMPDM.???, metadata, encoder); // TODO
+
+
+ // As text
+ for (Box box : apple.getBoxes()) {
+ if (box instanceof Utf8AppleDataBox) {
+ xhtml.element("p", ((Utf8AppleDataBox)box).getValue());
+ }
+ }
+ }
+
+ // TODO Check for other kinds too
+ }
+
+ // All done
+ xhtml.endDocument();
+
+ } finally {
+ tmp.dispose();
}
- // All done
- xhtml.endDocument();
}
- private static void addMetadata(String key, Metadata m, AbstractAppleMetaDataBox metadata) {
+ private static void addMetadata(String key, Metadata m, Utf8AppleDataBox metadata) {
if (metadata != null) {
m.add(key, metadata.getValue());
}
}
- private static void addMetadata(Property prop, Metadata m, AbstractAppleMetaDataBox metadata) {
+ private static void addMetadata(Property prop, Metadata m, Utf8AppleDataBox metadata) {
if (metadata != null) {
m.set(prop, metadata.getValue());
}
}
- /**
- * MP4 Dates are stored as 32-bit integer, which represent the seconds
- * since midnight, January 1, 1904, and are generally in UTC
- */
- private static Date MP4TimeToDate(long mp4Time) {
- long unix = mp4Time - EPOC_AS_MP4_TIME;
- return new Date(unix*1000);
- }
- private static final long EPOC_AS_MP4_TIME = 2082844800l;
-
- private static <T extends Box> T getOrNull(ContainerBox box, Class<T> clazz) {
+ private static <T extends Box> T getOrNull(Container box, Class<T> clazz) {
if (box == null) return null;
List<T> boxes = box.getBoxes(clazz);