You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2014/07/24 21:19:27 UTC

svn commit: r1613249 - in /tika/trunk/tika-parsers: pom.xml src/main/java/org/apache/tika/parser/mp4/DirectFileReadDataSource.java src/main/java/org/apache/tika/parser/mp4/MP4Parser.java

Author: nick
Date: Thu Jul 24 19:19:26 2014
New Revision: 1613249

URL: http://svn.apache.org/r1613249
Log:
Patch from Matthias Krueger from TIKA-1361 - Upgrade MP4Parser to 1.0.2, add a custom Data Source and use that for explicit temp handling. This closes #14 from Github

Added:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp4/DirectFileReadDataSource.java
Modified:
    tika/trunk/tika-parsers/pom.xml
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java

Modified: tika/trunk/tika-parsers/pom.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1613249&r1=1613248&r2=1613249&view=diff
==============================================================================
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Thu Jul 24 19:19:26 2014
@@ -164,7 +164,7 @@
     <dependency>
       <groupId>com.googlecode.mp4parser</groupId>
       <artifactId>isoparser</artifactId>
-      <version>1.0-RC-1</version>
+      <version>1.0.2</version>
     </dependency>
     <dependency>
        <groupId>com.drewnoakes</groupId>

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp4/DirectFileReadDataSource.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp4/DirectFileReadDataSource.java?rev=1613249&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp4/DirectFileReadDataSource.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp4/DirectFileReadDataSource.java Thu Jul 24 19:19:26 2014
@@ -0,0 +1,165 @@
+package org.apache.tika.parser.mp4;
+
+import com.googlecode.mp4parser.DataSource;
+
+import java.io.EOFException;
+import java.io.File;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.nio.ByteBuffer;
+import java.nio.channels.WritableByteChannel;
+
+import static com.googlecode.mp4parser.util.CastUtils.l2i;
+
+/**
+ * A {@link DataSource} implementation that relies on direct reads from a {@link RandomAccessFile}.
+ * It should be slower than {@link com.googlecode.mp4parser.FileDataSourceImpl} but does not incur the implicit file locks of
+ * memory mapped I/O on some JVMs. This implementation allows for a more controlled deletion of files
+ * and might be preferred when working with temporary files.
+ * <p>
+ * All operations share the file pointer of one {@link RandomAccessFile} and no synchronization is
+ * performed, so an instance must not be used from several threads at the same time.
+ * @see <a href="http://bugs.java.com/view_bug.do?bug_id=4724038">JDK-4724038 : (fs) Add unmap method to MappedByteBuffer</a>
+ * @see <a href="http://bugs.java.com/view_bug.do?bug_id=6359560">JDK-6359560 : (fs) File.deleteOnExit() doesn't work when MappedByteBuffer exists (win)</a>
+ */
+public class DirectFileReadDataSource implements DataSource {
+
+    /** Upper bound on the number of bytes copied per iteration of {@link #read(ByteBuffer)}. */
+    private static final int TRANSFER_SIZE = 8192;
+
+    /** The file all reads go to; opened read-only in the constructor and released by {@link #close()}. */
+    private final RandomAccessFile raf;
+
+    /**
+     * Opens the given file read-only.
+     *
+     * @param f the file to read from
+     * @throws IOException if the file cannot be opened for reading
+     */
+    public DirectFileReadDataSource(File f) throws IOException {
+        this.raf = new RandomAccessFile(f, "r");
+    }
+
+    /**
+     * Fills the buffer from the current file position, stopping early only at the end of the file.
+     *
+     * @param byteBuffer the buffer to fill; its remaining capacity determines how much is requested
+     * @return the number of bytes transferred, or -1 if the end of the file was reached before
+     *         any byte could be read
+     * @throws IOException if reading from the file fails
+     */
+    public int read(ByteBuffer byteBuffer) throws IOException {
+        int len = byteBuffer.remaining();
+        int totalRead = 0;
+        int bytesRead = 0;
+        // No need for a full-size scratch buffer when less than TRANSFER_SIZE is requested
+        byte[] buf = new byte[Math.min(len, TRANSFER_SIZE)];
+        while (totalRead < len) {
+            int bytesToRead = Math.min((len - totalRead), buf.length);
+            bytesRead = raf.read(buf, 0, bytesToRead);
+            if (bytesRead < 0) {
+                break;
+            }
+            totalRead += bytesRead;
+            byteBuffer.put(buf, 0, bytesRead);
+        }
+        // Only report end-of-file when nothing at all was read; otherwise return the partial count
+        return ((bytesRead < 0) && (totalRead == 0)) ? -1 : totalRead;
+    }
+
+    /**
+     * Issues a single read for as many bytes as the buffer has remaining and copies whatever
+     * that read delivered into the buffer.
+     *
+     * @param byteBuffer the buffer to copy into
+     * @return the number of bytes read, which may be less than requested, or -1 if the file
+     *         pointer was already at the end of the file
+     * @throws IOException if reading from the file fails
+     */
+    public int readAllInOnce(ByteBuffer byteBuffer) throws IOException {
+        byte[] buf = new byte[byteBuffer.remaining()];
+        int read = raf.read(buf);
+        // raf.read signals end-of-file with -1, which must not be passed on as a length to put()
+        if (read > 0) {
+            byteBuffer.put(buf, 0, read);
+        }
+        return read;
+    }
+
+    /**
+     * @return the current length of the file in bytes
+     * @throws IOException if the length cannot be determined
+     */
+    public long size() throws IOException {
+        return raf.length();
+    }
+
+    /**
+     * @return the offset in the file at which the next read takes place
+     * @throws IOException if the file pointer cannot be determined
+     */
+    public long position() throws IOException {
+        return raf.getFilePointer();
+    }
+
+    /**
+     * Moves the file pointer.
+     *
+     * @param nuPos the offset from the start of the file at which the next read takes place
+     * @throws IOException if seeking fails, e.g. because nuPos is negative
+     */
+    public void position(long nuPos) throws IOException {
+        raf.seek(nuPos);
+    }
+
+    /**
+     * Writes a region of the file to the target channel. The region is read into memory in full
+     * via {@link #map(long, long)}, which also moves the file pointer.
+     *
+     * @param position offset of the first byte to transfer
+     * @param count number of bytes to transfer
+     * @param target the channel to write to
+     * @return the number of bytes the channel reported as written
+     * @throws IOException if the region cannot be read or the channel cannot be written to
+     */
+    public long transferTo(long position, long count, WritableByteChannel target) throws IOException {
+        return target.write(map(position, count));
+    }
+
+    /**
+     * Reads a region of the file into a newly allocated heap buffer. Despite its name this does not
+     * memory-map anything. The file pointer is left just behind the region.
+     *
+     * @param startPosition offset of the first byte of the region
+     * @param size length of the region in bytes
+     * @return a buffer wrapping a copy of the requested bytes
+     * @throws EOFException if the region extends past the end of the file
+     * @throws IOException if startPosition or size is negative, or if reading fails
+     */
+    public ByteBuffer map(long startPosition, long size) throws IOException {
+        if (startPosition < 0 || size < 0) {
+            throw new IOException("startPosition and size must both be >= 0");
+        }
+        // Check the range before allocating, so that a bogus size taken from a damaged file
+        // cannot trigger a huge allocation. Phrased as a subtraction to avoid long overflow.
+        long available = Math.max(0L, raf.length() - startPosition);
+        if (size > available) {
+            throw new EOFException("Requested " + size + " bytes at offset " + startPosition
+                    + " but only " + available + " are available");
+        }
+        raf.seek(startPosition);
+        byte[] payload = new byte[l2i(size)];
+        raf.readFully(payload);
+        return ByteBuffer.wrap(payload);
+    }
+
+    /**
+     * Closes the underlying file.
+     *
+     * @throws IOException if closing fails
+     */
+    public void close() throws IOException {
+        raf.close();
+    }
+
+}

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java?rev=1613249&r1=1613248&r2=1613249&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java Thu Jul 24 19:19:26 2014
@@ -16,17 +16,13 @@
  */
 package org.apache.tika.parser.mp4;
 
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.Date;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
+import com.coremedia.iso.IsoFile;
+import com.coremedia.iso.boxes.*;
+import com.coremedia.iso.boxes.apple.AppleItemListBox;
+import com.coremedia.iso.boxes.sampleentry.AudioSampleEntry;
+import com.googlecode.mp4parser.boxes.apple.*;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Property;
@@ -39,31 +35,9 @@ import org.apache.tika.sax.XHTMLContentH
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
-import com.coremedia.iso.IsoFile;
-import com.coremedia.iso.boxes.Box;
-import com.coremedia.iso.boxes.ContainerBox;
-import com.coremedia.iso.boxes.FileTypeBox;
-import com.coremedia.iso.boxes.MetaBox;
-import com.coremedia.iso.boxes.MovieBox;
-import com.coremedia.iso.boxes.MovieHeaderBox;
-import com.coremedia.iso.boxes.SampleDescriptionBox;
-import com.coremedia.iso.boxes.SampleTableBox;
-import com.coremedia.iso.boxes.TrackBox;
-import com.coremedia.iso.boxes.TrackHeaderBox;
-import com.coremedia.iso.boxes.UserDataBox;
-import com.coremedia.iso.boxes.apple.AbstractAppleMetaDataBox;
-import com.coremedia.iso.boxes.apple.AppleAlbumBox;
-import com.coremedia.iso.boxes.apple.AppleArtistBox;
-import com.coremedia.iso.boxes.apple.AppleCommentBox;
-import com.coremedia.iso.boxes.apple.AppleCustomGenreBox;
-import com.coremedia.iso.boxes.apple.AppleEncoderBox;
-import com.coremedia.iso.boxes.apple.AppleItemListBox;
-import com.coremedia.iso.boxes.apple.AppleRecordingYearBox;
-import com.coremedia.iso.boxes.apple.AppleStandardGenreBox;
-import com.coremedia.iso.boxes.apple.AppleTrackAuthorBox;
-import com.coremedia.iso.boxes.apple.AppleTrackNumberBox;
-import com.coremedia.iso.boxes.apple.AppleTrackTitleBox;
-import com.coremedia.iso.boxes.sampleentry.AudioSampleEntry;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.*;
 
 /**
  * Parser for the MP4 media container format, as well as the older
@@ -112,197 +86,178 @@ public class MP4Parser extends AbstractP
         // The MP4Parser library accepts either a File, or a byte array
         // As MP4 video files are typically large, always use a file to
         //  avoid OOMs that may occur with in-memory buffering
-        TikaInputStream tstream = TikaInputStream.get(stream);
+        TemporaryResources tmp = new TemporaryResources();
+        TikaInputStream tstream = TikaInputStream.get(stream, tmp);
         try {
-           isoFile = new IsoFile(tstream.getFileChannel());
-        } finally {
-           tstream.close();
-        }
-        
-        
-        // Grab the file type box
-        FileTypeBox fileType = getOrNull(isoFile, FileTypeBox.class);
-        if (fileType != null) {
-           // Identify the type
-           MediaType type = MediaType.application("mp4");
-           for (MediaType t : typesMap.keySet()) {
-              if (typesMap.get(t).contains(fileType.getMajorBrand())) {
-                 type = t;
-                 break;
-              }
-           }
-           metadata.set(Metadata.CONTENT_TYPE, type.toString());
-           
-           if (type.getType().equals("audio")) {
-              metadata.set(XMPDM.AUDIO_COMPRESSOR, fileType.getMajorBrand().trim());
-           }
-        } else {
-           // Some older QuickTime files lack the FileType
-           metadata.set(Metadata.CONTENT_TYPE, "video/quicktime");
-        }
-        
-        
-        // Get the main MOOV box
-        MovieBox moov = getOrNull(isoFile, MovieBox.class);
-        if (moov == null) {
-           // Bail out
-           return;
-        }
+            isoFile = new IsoFile(new DirectFileReadDataSource(tstream.getFile()));
+            tmp.addResource(isoFile);
 
-        
-        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-        xhtml.startDocument();
-        
-        
-        // Pull out some information from the header box
-        MovieHeaderBox mHeader = getOrNull(moov, MovieHeaderBox.class);
-        if (mHeader != null) {
-           // Get the creation and modification dates
-           metadata.set(
-                 Metadata.CREATION_DATE, 
-                 MP4TimeToDate(mHeader.getCreationTime())
-           );
-           metadata.set(
-                 TikaCoreProperties.MODIFIED,
-                 MP4TimeToDate(mHeader.getModificationTime())
-           );
-           
-           // Get the duration
-           double durationSeconds = ((double)mHeader.getDuration()) / mHeader.getTimescale();
-           // TODO Use this
-           
-           // The timescale is normally the sampling rate
-           metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int)mHeader.getTimescale());
-        }
-        
-        
-        // Get some more information from the track header
-        // TODO Decide how to handle multiple tracks
-        List<TrackBox> tb = moov.getBoxes(TrackBox.class);
-        if (tb.size() > 0) {
-           TrackBox track = tb.get(0);
-           
-           TrackHeaderBox header = track.getTrackHeaderBox();
-           // Get the creation and modification dates
-           metadata.set(
-                 TikaCoreProperties.CREATED, 
-                 MP4TimeToDate(header.getCreationTime())
-           );
-           metadata.set(
-                 TikaCoreProperties.MODIFIED,
-                 MP4TimeToDate(header.getModificationTime())
-           );
-           
-           // Get the video with and height
-           metadata.set(Metadata.IMAGE_WIDTH,  (int)header.getWidth());
-           metadata.set(Metadata.IMAGE_LENGTH, (int)header.getHeight());
-           
-           // Get the sample information
-           SampleTableBox samples = track.getSampleTableBox();
-           SampleDescriptionBox sampleDesc = samples.getSampleDescriptionBox();
-           if (sampleDesc != null) {
-              // Look for the first Audio Sample, if present
-              AudioSampleEntry sample = getOrNull(sampleDesc, AudioSampleEntry.class);
-              if (sample != null) {
-                 XMPDM.ChannelTypePropertyConverter.convertAndSet(metadata, sample.getChannelCount());
-                 //metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, sample.getSampleSize());    // TODO Num -> Type mapping
-                 metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int)sample.getSampleRate());
-                 //metadata.set(XMPDM.AUDIO_, sample.getSamplesPerPacket());
-                 //metadata.set(XMPDM.AUDIO_, sample.getBytesPerSample());
-              }
-           }
-        }
-        
-        // Get metadata from the User Data Box
-        UserDataBox userData = getOrNull(moov, UserDataBox.class);
-        if (userData != null) {
-           MetaBox meta = getOrNull(userData, MetaBox.class);
-
-           // Check for iTunes Metadata
-           // See http://atomicparsley.sourceforge.net/mpeg-4files.html and
-           //  http://code.google.com/p/mp4v2/wiki/iTunesMetadata for more on these
-           AppleItemListBox apple = getOrNull(meta, AppleItemListBox.class);
-           if (apple != null) {
-              // Title
-              AppleTrackTitleBox title = getOrNull(apple, AppleTrackTitleBox.class);
-              addMetadata(TikaCoreProperties.TITLE, metadata, title);
-
-              // Artist
-              AppleArtistBox artist = getOrNull(apple, AppleArtistBox.class);
-              addMetadata(TikaCoreProperties.CREATOR, metadata, artist);
-              addMetadata(XMPDM.ARTIST, metadata, artist);
-              
-              // Album
-              AppleAlbumBox album = getOrNull(apple, AppleAlbumBox.class);
-              addMetadata(XMPDM.ALBUM, metadata, album);
-              
-              // Composer
-              AppleTrackAuthorBox composer = getOrNull(apple, AppleTrackAuthorBox.class);
-              addMetadata(XMPDM.COMPOSER, metadata, composer);
-              
-              // Genre
-              AppleStandardGenreBox sGenre = getOrNull(apple, AppleStandardGenreBox.class);
-              AppleCustomGenreBox   cGenre = getOrNull(apple, AppleCustomGenreBox.class);
-              addMetadata(XMPDM.GENRE, metadata, sGenre);
-              addMetadata(XMPDM.GENRE, metadata, cGenre);
-              
-              // Year
-              AppleRecordingYearBox year = getOrNull(apple, AppleRecordingYearBox.class);
-              addMetadata(XMPDM.RELEASE_DATE, metadata, year);
-              
-              // Track number 
-              AppleTrackNumberBox trackNum = getOrNull(apple, AppleTrackNumberBox.class);
-              if (trackNum != null) {
-                 metadata.set(XMPDM.TRACK_NUMBER, trackNum.getTrackNumber());
-                 //metadata.set(XMPDM.NUMBER_OF_TRACKS, trackNum.getNumberOfTracks()); // TODO
-              }
-              
-              // Comment
-              AppleCommentBox comment = getOrNull(apple, AppleCommentBox.class);
-              addMetadata(XMPDM.LOG_COMMENT, metadata, comment);
-              
-              // Encoder
-              AppleEncoderBox encoder = getOrNull(apple, AppleEncoderBox.class);
-              // addMetadata(XMPDM.???, metadata, encoder); // TODO
-              
-              
-              // As text
-              for (Box box : apple.getBoxes()) {
-                 if (box instanceof AbstractAppleMetaDataBox) {
-                    xhtml.element("p", ((AbstractAppleMetaDataBox)box).getValue());
-                 }
-              }
-           }
-           
-           // TODO Check for other kinds too
+            // Grab the file type box
+            FileTypeBox fileType = getOrNull(isoFile, FileTypeBox.class);
+            if (fileType != null) {
+               // Identify the type
+               MediaType type = MediaType.application("mp4");
+               for (MediaType t : typesMap.keySet()) {
+                  if (typesMap.get(t).contains(fileType.getMajorBrand())) {
+                     type = t;
+                     break;
+                  }
+               }
+               metadata.set(Metadata.CONTENT_TYPE, type.toString());
+
+               if (type.getType().equals("audio")) {
+                  metadata.set(XMPDM.AUDIO_COMPRESSOR, fileType.getMajorBrand().trim());
+               }
+            } else {
+               // Some older QuickTime files lack the FileType
+               metadata.set(Metadata.CONTENT_TYPE, "video/quicktime");
+            }
+
+
+            // Get the main MOOV box
+            MovieBox moov = getOrNull(isoFile, MovieBox.class);
+            if (moov == null) {
+               // Bail out
+               return;
+            }
+
+
+            XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+            xhtml.startDocument();
+
+
+            // Pull out some information from the header box
+            MovieHeaderBox mHeader = getOrNull(moov, MovieHeaderBox.class);
+            if (mHeader != null) {
+               // Get the creation and modification dates
+               metadata.set(Metadata.CREATION_DATE, mHeader.getCreationTime());
+               metadata.set(TikaCoreProperties.MODIFIED, mHeader.getModificationTime());
+
+               // Get the duration
+               double durationSeconds = ((double)mHeader.getDuration()) / mHeader.getTimescale();
+               // TODO Use this
+
+               // The timescale is normally the sampling rate
+               metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int)mHeader.getTimescale());
+            }
+
+
+            // Get some more information from the track header
+            // TODO Decide how to handle multiple tracks
+            List<TrackBox> tb = moov.getBoxes(TrackBox.class);
+            if (tb.size() > 0) {
+               TrackBox track = tb.get(0);
+
+               TrackHeaderBox header = track.getTrackHeaderBox();
+               // Get the creation and modification dates
+               metadata.set(TikaCoreProperties.CREATED, header.getCreationTime());
+               metadata.set(TikaCoreProperties.MODIFIED, header.getModificationTime());
+
+               // Get the video width and height
+               metadata.set(Metadata.IMAGE_WIDTH,  (int)header.getWidth());
+               metadata.set(Metadata.IMAGE_LENGTH, (int)header.getHeight());
+
+               // Get the sample information
+               SampleTableBox samples = track.getSampleTableBox();
+               SampleDescriptionBox sampleDesc = samples.getSampleDescriptionBox();
+               if (sampleDesc != null) {
+                  // Look for the first Audio Sample, if present
+                  AudioSampleEntry sample = getOrNull(sampleDesc, AudioSampleEntry.class);
+                  if (sample != null) {
+                     XMPDM.ChannelTypePropertyConverter.convertAndSet(metadata, sample.getChannelCount());
+                     //metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, sample.getSampleSize());    // TODO Num -> Type mapping
+                     metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int)sample.getSampleRate());
+                     //metadata.set(XMPDM.AUDIO_, sample.getSamplesPerPacket());
+                     //metadata.set(XMPDM.AUDIO_, sample.getBytesPerSample());
+                  }
+               }
+            }
+
+            // Get metadata from the User Data Box
+            UserDataBox userData = getOrNull(moov, UserDataBox.class);
+            if (userData != null) {
+               MetaBox meta = getOrNull(userData, MetaBox.class);
+
+               // Check for iTunes Metadata
+               // See http://atomicparsley.sourceforge.net/mpeg-4files.html and
+               //  http://code.google.com/p/mp4v2/wiki/iTunesMetadata for more on these
+               AppleItemListBox apple = getOrNull(meta, AppleItemListBox.class);
+               if (apple != null) {
+                  // Title
+                  AppleNameBox title = getOrNull(apple, AppleNameBox.class);
+                  addMetadata(TikaCoreProperties.TITLE, metadata, title);
+
+                  // Artist
+                  AppleArtistBox artist = getOrNull(apple, AppleArtistBox.class);
+                  addMetadata(TikaCoreProperties.CREATOR, metadata, artist);
+                  addMetadata(XMPDM.ARTIST, metadata, artist);
+
+                  // Album
+                  AppleAlbumBox album = getOrNull(apple, AppleAlbumBox.class);
+                  addMetadata(XMPDM.ALBUM, metadata, album);
+
+                  // Composer
+                  AppleTrackAuthorBox composer = getOrNull(apple, AppleTrackAuthorBox.class);
+                  addMetadata(XMPDM.COMPOSER, metadata, composer);
+
+                  // Genre
+                  AppleGenreBox genre = getOrNull(apple, AppleGenreBox.class);
+                  addMetadata(XMPDM.GENRE, metadata, genre);
+
+                  // Year
+                  AppleRecordingYear2Box year = getOrNull(apple, AppleRecordingYear2Box.class);
+                  if (year != null) {
+                      metadata.set(XMPDM.RELEASE_DATE, year.getValue());
+                  }
+
+                  // Track number
+                  AppleTrackNumberBox trackNum = getOrNull(apple, AppleTrackNumberBox.class);
+                  if (trackNum != null) {
+                     metadata.set(XMPDM.TRACK_NUMBER, trackNum.getA());
+                     //metadata.set(XMPDM.NUMBER_OF_TRACKS, trackNum.getB()); // TODO
+                  }
+
+                  // Comment
+                  AppleCommentBox comment = getOrNull(apple, AppleCommentBox.class);
+                  addMetadata(XMPDM.LOG_COMMENT, metadata, comment);
+
+                  // Encoder
+                  AppleEncoderBox encoder = getOrNull(apple, AppleEncoderBox.class);
+                  // addMetadata(XMPDM.???, metadata, encoder); // TODO
+
+
+                  // As text
+                  for (Box box : apple.getBoxes()) {
+                     if (box instanceof Utf8AppleDataBox) {
+                        xhtml.element("p", ((Utf8AppleDataBox)box).getValue());
+                     }
+                  }
+               }
+
+               // TODO Check for other kinds too
+            }
+
+            // All done
+            xhtml.endDocument();
+
+        } finally {
+            tmp.dispose();
         }
 
-        // All done
-        xhtml.endDocument();
     }
     
-    private static void addMetadata(String key, Metadata m, AbstractAppleMetaDataBox metadata) {
+    private static void addMetadata(String key, Metadata m, Utf8AppleDataBox metadata) {
        if (metadata != null) {
           m.add(key, metadata.getValue());
        }
     }
-    private static void addMetadata(Property prop, Metadata m, AbstractAppleMetaDataBox metadata) {
+    private static void addMetadata(Property prop, Metadata m, Utf8AppleDataBox metadata) {
        if (metadata != null) {
           m.set(prop, metadata.getValue());
        }
     }
     
-    /**
-     * MP4 Dates are stored as 32-bit integer, which represent the seconds 
-     * since midnight, January 1, 1904, and are generally in UTC 
-     */
-    private static Date MP4TimeToDate(long mp4Time) {
-       long unix = mp4Time - EPOC_AS_MP4_TIME;
-       return new Date(unix*1000);
-    }
-    private static final long EPOC_AS_MP4_TIME = 2082844800l;
-    
-    private static <T extends Box> T getOrNull(ContainerBox box, Class<T> clazz) {
+    private static <T extends Box> T getOrNull(Container box, Class<T> clazz) {
        if (box == null) return null;
 
        List<T> boxes = box.getBoxes(clazz);