You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/04/13 14:55:46 UTC

[tika] branch branch_1x updated (171f434 -> ca6bf65)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from 171f434  TIKA-3087 -- general upgrades for 1.24.1
     new 8e2eb05  TIKA-3084 -- upgrade mp4 parser dependency
     new ca6bf65  TIKA-3090 -- extract doc security from ooxml

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../tika/metadata/OfficeOpenXMLExtended.java       |  18 +-
 .../src/test/java/org/apache/tika/TikaTest.java    |  16 +
 tika-parsers/pom.xml                               |   4 +-
 .../parser/microsoft/ooxml/MetadataExtractor.java  |  24 +-
 .../tika/parser/mp4/DirectFileReadDataSource.java  | 127 -------
 .../java/org/apache/tika/parser/mp4/MP4Parser.java | 399 +++++++++++----------
 .../parser/microsoft/ooxml/OOXMLParserTest.java    |  11 +
 .../org/apache/tika/parser/mp4/MP4ParserTest.java  |   1 +
 .../test-documents/testWORD_docSecurity.docx       | Bin 0 -> 12861 bytes
 9 files changed, 279 insertions(+), 321 deletions(-)
 delete mode 100644 tika-parsers/src/main/java/org/apache/tika/parser/mp4/DirectFileReadDataSource.java
 create mode 100644 tika-parsers/src/test/resources/test-documents/testWORD_docSecurity.docx


[tika] 01/02: TIKA-3084 -- upgrade mp4 parser dependency

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 8e2eb05292bc35503a3d82a908c426854e23ac83
Author: tallison <ta...@apache.org>
AuthorDate: Fri Apr 10 17:31:49 2020 -0400

    TIKA-3084 -- upgrade mp4 parser dependency
---
 tika-parsers/pom.xml                               |   4 +-
 .../tika/parser/mp4/DirectFileReadDataSource.java  | 127 -------
 .../java/org/apache/tika/parser/mp4/MP4Parser.java | 399 +++++++++++----------
 .../org/apache/tika/parser/mp4/MP4ParserTest.java  |   1 +
 4 files changed, 213 insertions(+), 318 deletions(-)

diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index edd1aa3..ac0fdd9 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -322,9 +322,9 @@
       <version>8.0.1</version>
     </dependency>
     <dependency>
-      <groupId>com.googlecode.mp4parser</groupId>
+      <groupId>org.tallison</groupId>
       <artifactId>isoparser</artifactId>
-      <version>1.1.22</version>
+      <version>1.9.41.2</version>
     </dependency>
     <!-- this is a fork of com.drewnoakes
       metadata extractor that shade/relocates com.adobe.internal
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/mp4/DirectFileReadDataSource.java b/tika-parsers/src/main/java/org/apache/tika/parser/mp4/DirectFileReadDataSource.java
deleted file mode 100644
index 698a106..0000000
--- a/tika-parsers/src/main/java/org/apache/tika/parser/mp4/DirectFileReadDataSource.java
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.mp4;
-
-import static com.googlecode.mp4parser.util.CastUtils.l2i;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.RandomAccessFile;
-import java.math.BigInteger;
-import java.nio.ByteBuffer;
-import java.nio.channels.WritableByteChannel;
-
-import com.googlecode.mp4parser.DataSource;
-
-/**
- * A {@link DataSource} implementation that relies on direct reads from a {@link RandomAccessFile}.
- * It should be slower than {@link com.googlecode.mp4parser.FileDataSourceImpl} but does not incur the implicit file locks of
- * memory mapped I/O on some JVMs. This implementation allows for a more controlled deletion of files
- * and might be preferred when working with temporary files.
- * @see <a href="http://bugs.java.com/view_bug.do?bug_id=4724038">JDK-4724038 : (fs) Add unmap method to MappedByteBuffer</a>
- * @see <a href="http://bugs.java.com/view_bug.do?bug_id=6359560">JDK-6359560 : (fs) File.deleteOnExit() doesn't work when MappedByteBuffer exists (win)</a>
- */
-public class DirectFileReadDataSource implements DataSource {
-
-    private static final int TRANSFER_SIZE = 8192;
-
-    private RandomAccessFile raf;
-
-    public DirectFileReadDataSource(File f) throws IOException {
-        this.raf = new RandomAccessFile(f, "r");
-    }
-
-    public int read(ByteBuffer byteBuffer) throws IOException {
-        int len = byteBuffer.remaining();
-        int totalRead = 0;
-        int bytesRead = 0;
-        byte[] buf = new byte[TRANSFER_SIZE];
-        while (totalRead < len) {
-            int bytesToRead = Math.min((len - totalRead), TRANSFER_SIZE);
-            bytesRead = raf.read(buf, 0, bytesToRead);
-            if (bytesRead < 0) {
-                break;
-            } else {
-                totalRead += bytesRead;
-            }
-            byteBuffer.put(buf, 0, bytesRead);
-        }
-        if (bytesRead < 0 && position() == size() && byteBuffer.hasRemaining()) {
-            throw new IOException("End of stream reached earlier than expected");
-        }
-        return ((bytesRead < 0) && (totalRead == 0)) ? -1 : totalRead;
-    }
-
-    public int readAllInOnce(ByteBuffer byteBuffer) throws IOException {
-        if (byteBuffer.remaining() > raf.length()) {
-            throw new IOException("trying to readAllInOnce past end of stream");
-        }
-        byte[] buf = new byte[byteBuffer.remaining()];
-        int read = raf.read(buf);
-        byteBuffer.put(buf, 0, read);
-        return read;
-    }
-
-    public long size() throws IOException {
-        return raf.length();
-    }
-
-    public long position() throws IOException {
-        return raf.getFilePointer();
-    }
-
-    public void position(long nuPos) throws IOException {
-        if (nuPos > raf.length()) {
-            throw new IOException("requesting seek past end of stream");
-        }
-        raf.seek(nuPos);
-    }
-
-    public long transferTo(long position, long count, WritableByteChannel target) throws IOException {
-        return target.write(map(position, count));
-    }
-
-    public ByteBuffer map(long startPosition, long size) throws IOException {
-        if (startPosition < 0 || size < 0) {
-            throw new IOException("startPosition and size must both be >= 0");
-        }
-        //make sure that start+size aren't greater than avail size
-        //in raf.
-        BigInteger end = BigInteger.valueOf(startPosition);
-        end = end.add(BigInteger.valueOf(size));
-        if (end.compareTo(BigInteger.valueOf(raf.length())) > 0) {
-            throw new IOException("requesting read past end of stream");
-        }
-
-        raf.seek(startPosition);
-        int payLoadSize = l2i(size);
-        //hack to check for potential overflow
-        if (Long.MAX_VALUE-payLoadSize < startPosition ||
-                Long.MAX_VALUE-payLoadSize > raf.length()) {
-            throw new IOException("requesting read past end of stream");
-        }
-        byte[] payload = new byte[payLoadSize];
-        raf.readFully(payload);
-        return ByteBuffer.wrap(payload);
-    }
-
-    @Override
-    public void close() throws IOException {
-        raf.close();
-    }
-
-}
\ No newline at end of file
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java b/tika-parsers/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java
index f568839..ea8d94d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java
@@ -16,36 +16,7 @@
  */
 package org.apache.tika.parser.mp4;
 
-
-import com.coremedia.iso.IsoFile;
-import com.coremedia.iso.boxes.Box;
-import com.coremedia.iso.boxes.Container;
-import com.coremedia.iso.boxes.FileTypeBox;
-import com.coremedia.iso.boxes.MetaBox;
-import com.coremedia.iso.boxes.MovieBox;
-import com.coremedia.iso.boxes.MovieHeaderBox;
-import com.coremedia.iso.boxes.SampleDescriptionBox;
-import com.coremedia.iso.boxes.SampleTableBox;
-import com.coremedia.iso.boxes.TrackBox;
-import com.coremedia.iso.boxes.TrackHeaderBox;
-import com.coremedia.iso.boxes.UserDataBox;
-import com.coremedia.iso.boxes.apple.AppleItemListBox;
-import com.coremedia.iso.boxes.sampleentry.AudioSampleEntry;
-import com.googlecode.mp4parser.DataSource;
-import com.googlecode.mp4parser.boxes.apple.AppleAlbumBox;
-import com.googlecode.mp4parser.boxes.apple.AppleArtist2Box;
-import com.googlecode.mp4parser.boxes.apple.AppleArtistBox;
-import com.googlecode.mp4parser.boxes.apple.AppleCommentBox;
-import com.googlecode.mp4parser.boxes.apple.AppleCompilationBox;
-import com.googlecode.mp4parser.boxes.apple.AppleDiskNumberBox;
-import com.googlecode.mp4parser.boxes.apple.AppleEncoderBox;
-import com.googlecode.mp4parser.boxes.apple.AppleGPSCoordinatesBox;
-import com.googlecode.mp4parser.boxes.apple.AppleGenreBox;
-import com.googlecode.mp4parser.boxes.apple.AppleNameBox;
-import com.googlecode.mp4parser.boxes.apple.AppleRecordingYear2Box;
-import com.googlecode.mp4parser.boxes.apple.AppleTrackAuthorBox;
-import com.googlecode.mp4parser.boxes.apple.AppleTrackNumberBox;
-import com.googlecode.mp4parser.boxes.apple.Utf8AppleDataBox;
+import org.apache.tika.config.Field;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.io.TikaInputStream;
@@ -58,6 +29,34 @@ import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.XHTMLContentHandler;
+import org.mp4parser.Box;
+import org.mp4parser.Container;
+import org.mp4parser.IsoFile;
+import org.mp4parser.boxes.apple.AppleAlbumBox;
+import org.mp4parser.boxes.apple.AppleArtist2Box;
+import org.mp4parser.boxes.apple.AppleArtistBox;
+import org.mp4parser.boxes.apple.AppleCommentBox;
+import org.mp4parser.boxes.apple.AppleCompilationBox;
+import org.mp4parser.boxes.apple.AppleDiskNumberBox;
+import org.mp4parser.boxes.apple.AppleEncoderBox;
+import org.mp4parser.boxes.apple.AppleGPSCoordinatesBox;
+import org.mp4parser.boxes.apple.AppleGenreBox;
+import org.mp4parser.boxes.apple.AppleItemListBox;
+import org.mp4parser.boxes.apple.AppleNameBox;
+import org.mp4parser.boxes.apple.AppleRecordingYear2Box;
+import org.mp4parser.boxes.apple.AppleTrackAuthorBox;
+import org.mp4parser.boxes.apple.AppleTrackNumberBox;
+import org.mp4parser.boxes.apple.Utf8AppleDataBox;
+import org.mp4parser.boxes.iso14496.part12.FileTypeBox;
+import org.mp4parser.boxes.iso14496.part12.MetaBox;
+import org.mp4parser.boxes.iso14496.part12.MovieBox;
+import org.mp4parser.boxes.iso14496.part12.MovieHeaderBox;
+import org.mp4parser.boxes.iso14496.part12.SampleDescriptionBox;
+import org.mp4parser.boxes.iso14496.part12.SampleTableBox;
+import org.mp4parser.boxes.iso14496.part12.TrackBox;
+import org.mp4parser.boxes.iso14496.part12.TrackHeaderBox;
+import org.mp4parser.boxes.iso14496.part12.UserDataBox;
+import org.mp4parser.boxes.sampleentry.AudioSampleEntry;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
@@ -103,7 +102,7 @@ public class MP4Parser extends AbstractParser {
              "mp41", "mp42"));
        typesMap.put(MediaType.video("x-m4v"), Arrays.asList(
              "M4V ", "M4VH", "M4VP"));
-       
+
        typesMap.put(MediaType.video("quicktime"), Collections.<String>emptyList());
        typesMap.put(MediaType.application("mp4"), Collections.<String>emptyList());
     }
@@ -117,7 +116,6 @@ public class MP4Parser extends AbstractParser {
         return SUPPORTED_TYPES;
     }
 
-
     public void parse(
             InputStream stream, ContentHandler handler,
             Metadata metadata, ParseContext context)
@@ -129,186 +127,209 @@ public class MP4Parser extends AbstractParser {
         TemporaryResources tmp = new TemporaryResources();
         TikaInputStream tstream = TikaInputStream.get(stream, tmp);
 
-        try (DataSource dataSource = new DirectFileReadDataSource(tstream.getFile())) {
-            try (IsoFile isoFile = new IsoFile(dataSource)) {
-                tmp.addResource(isoFile);
-
-                // Grab the file type box
-                FileTypeBox fileType = getOrNull(isoFile, FileTypeBox.class);
-                if (fileType != null) {
-                    // Identify the type
-                    MediaType type = MediaType.application("mp4");
-                    for (Map.Entry<MediaType, List<String>> e : typesMap.entrySet()) {
-                        if (e.getValue().contains(fileType.getMajorBrand())) {
-                            type = e.getKey();
-                            break;
-                        }
+        try (IsoFile isoFile = new IsoFile(tstream.getFile())) {
+            tmp.addResource(isoFile);
+
+            // Grab the file type box
+            FileTypeBox fileType = getOrNull(isoFile, FileTypeBox.class);
+            if (fileType != null) {
+                // Identify the type
+                MediaType type = MediaType.application("mp4");
+                for (Map.Entry<MediaType, List<String>> e : typesMap.entrySet()) {
+                    if (e.getValue().contains(fileType.getMajorBrand())) {
+                        type = e.getKey();
+                        break;
                     }
-                    metadata.set(Metadata.CONTENT_TYPE, type.toString());
+                }
+                metadata.set(Metadata.CONTENT_TYPE, type.toString());
 
-                    if (type.getType().equals("audio")) {
-                        metadata.set(XMPDM.AUDIO_COMPRESSOR, fileType.getMajorBrand().trim());
-                    }
-                } else {
-                    // Some older QuickTime files lack the FileType
-                    metadata.set(Metadata.CONTENT_TYPE, "video/quicktime");
+                if (type.getType().equals("audio")) {
+                    metadata.set(XMPDM.AUDIO_COMPRESSOR, fileType.getMajorBrand().trim());
                 }
+            } else {
+                // Some older QuickTime files lack the FileType
+                metadata.set(Metadata.CONTENT_TYPE, "video/quicktime");
+            }
 
 
-                // Get the main MOOV box
-                MovieBox moov = getOrNull(isoFile, MovieBox.class);
-                if (moov == null) {
-                    // Bail out
-                    return;
-                }
+            // Get the main MOOV box
+            MovieBox moov = getOrNull(isoFile, MovieBox.class);
+            if (moov == null) {
+                // Bail out
+                return;
+            }
 
 
-                XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-                xhtml.startDocument();
+            XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+            xhtml.startDocument();
 
+            handleMovieHeaderBox(moov, metadata, xhtml);
+            handleTrackBoxes(moov, metadata, xhtml);
 
-                // Pull out some information from the header box
-                MovieHeaderBox mHeader = getOrNull(moov, MovieHeaderBox.class);
-                if (mHeader != null) {
-                    // Get the creation and modification dates
-                    metadata.set(Metadata.CREATION_DATE, mHeader.getCreationTime());
-                    metadata.set(TikaCoreProperties.MODIFIED, mHeader.getModificationTime());
+            // Get metadata from the User Data Box
+            UserDataBox userData = getOrNull(moov, UserDataBox.class);
+            if (userData != null) {
+                extractGPS(userData, metadata);
+                MetaBox metaBox = getOrNull(userData, MetaBox.class);
 
-                    // Get the duration
-                    double durationSeconds = ((double) mHeader.getDuration()) / mHeader.getTimescale();
-                    metadata.set(XMPDM.DURATION, DURATION_FORMAT.format(durationSeconds));
+                // Check for iTunes Metadata
+                // See http://atomicparsley.sourceforge.net/mpeg-4files.html and
+                //  http://code.google.com/p/mp4v2/wiki/iTunesMetadata for more on these
+                handleApple(metaBox, metadata, xhtml);
+                // TODO Check for other kinds too
+            }
 
-                    // The timescale is normally the sampling rate
-                    metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int) mHeader.getTimescale());
-                }
+            // All done
+            xhtml.endDocument();
 
+        } finally {
+            tmp.dispose();
+        }
 
-                // Get some more information from the track header
-                // TODO Decide how to handle multiple tracks
-                List<TrackBox> tb = moov.getBoxes(TrackBox.class);
-                if (tb.size() > 0) {
-                    TrackBox track = tb.get(0);
-
-                    TrackHeaderBox header = track.getTrackHeaderBox();
-                    // Get the creation and modification dates
-                    metadata.set(TikaCoreProperties.CREATED, header.getCreationTime());
-                    metadata.set(TikaCoreProperties.MODIFIED, header.getModificationTime());
-
-                    // Get the video with and height
-                    metadata.set(Metadata.IMAGE_WIDTH, (int) header.getWidth());
-                    metadata.set(Metadata.IMAGE_LENGTH, (int) header.getHeight());
-
-                    // Get the sample information
-                    SampleTableBox samples = track.getSampleTableBox();
-                    if (samples !=  null) {
-                        SampleDescriptionBox sampleDesc = samples.getSampleDescriptionBox();
-                        if (sampleDesc != null) {
-                            // Look for the first Audio Sample, if present
-                            AudioSampleEntry sample = getOrNull(sampleDesc, AudioSampleEntry.class);
-                            if (sample != null) {
-                                XMPDM.ChannelTypePropertyConverter.convertAndSet(metadata, sample.getChannelCount());
-                                //metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, sample.getSampleSize());    // TODO Num -> Type mapping
-                                metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int) sample.getSampleRate());
-                                //metadata.set(XMPDM.AUDIO_, sample.getSamplesPerPacket());
-                                //metadata.set(XMPDM.AUDIO_, sample.getBytesPerSample());
-                            }
-                        }
-                    }
-                }
+    }
 
-                // Get metadata from the User Data Box
-                UserDataBox userData = getOrNull(moov, UserDataBox.class);
-                if (userData != null) {
-                    extractGPS(userData, metadata);
-                    MetaBox meta = getOrNull(userData, MetaBox.class);
-
-                    // Check for iTunes Metadata
-                    // See http://atomicparsley.sourceforge.net/mpeg-4files.html and
-                    //  http://code.google.com/p/mp4v2/wiki/iTunesMetadata for more on these
-                    AppleItemListBox apple = getOrNull(meta, AppleItemListBox.class);
-                    if (apple != null) {
-                        // Title
-                        AppleNameBox title = getOrNull(apple, AppleNameBox.class);
-                        addMetadata(TikaCoreProperties.TITLE, metadata, title);
-
-                        // Artist
-                        AppleArtistBox artist = getOrNull(apple, AppleArtistBox.class);
-                        addMetadata(TikaCoreProperties.CREATOR, metadata, artist);
-                        addMetadata(XMPDM.ARTIST, metadata, artist);
-
-                        // Album Artist
-                        AppleArtist2Box artist2 = getOrNull(apple, AppleArtist2Box.class);
-                        addMetadata(XMPDM.ALBUM_ARTIST, metadata, artist2);
-
-                        // Album
-                        AppleAlbumBox album = getOrNull(apple, AppleAlbumBox.class);
-                        addMetadata(XMPDM.ALBUM, metadata, album);
-
-                        // Composer
-                        AppleTrackAuthorBox composer = getOrNull(apple, AppleTrackAuthorBox.class);
-                        addMetadata(XMPDM.COMPOSER, metadata, composer);
-
-                        // Genre
-                        AppleGenreBox genre = getOrNull(apple, AppleGenreBox.class);
-                        addMetadata(XMPDM.GENRE, metadata, genre);
-
-                        // Year
-                        AppleRecordingYear2Box year = getOrNull(apple, AppleRecordingYear2Box.class);
-                        if (year != null) {
-                            metadata.set(XMPDM.RELEASE_DATE, year.getValue());
-                        }
-
-                        // Track number
-                        AppleTrackNumberBox trackNum = getOrNull(apple, AppleTrackNumberBox.class);
-                        if (trackNum != null) {
-                            metadata.set(XMPDM.TRACK_NUMBER, trackNum.getA());
-                            //metadata.set(XMPDM.NUMBER_OF_TRACKS, trackNum.getB()); // TODO
-                        }
-
-                        // Disc number
-                        AppleDiskNumberBox discNum = getOrNull(apple, AppleDiskNumberBox.class);
-                        if (discNum != null) {
-                            metadata.set(XMPDM.DISC_NUMBER, discNum.getA());
-                        }
-
-                        // Compilation
-                        AppleCompilationBox compilation = getOrNull(apple, AppleCompilationBox.class);
-                        if (compilation != null) {
-                            metadata.set(XMPDM.COMPILATION, (int) compilation.getValue());
-                        }
-
-                        // Comment
-                        AppleCommentBox comment = getOrNull(apple, AppleCommentBox.class);
-                        addMetadata(XMPDM.LOG_COMMENT, metadata, comment);
-
-                        // Encoder
-                        AppleEncoderBox encoder = getOrNull(apple, AppleEncoderBox.class);
-                        if (encoder != null) {
-                            metadata.set(XMP.CREATOR_TOOL, encoder.getValue());
-                        }
-
-
-                        // As text
-                        for (Box box : apple.getBoxes()) {
-                            if (box instanceof Utf8AppleDataBox) {
-                                xhtml.element("p", ((Utf8AppleDataBox) box).getValue());
-                            }
-                        }
-                    }
+    private void handleTrackBoxes(MovieBox moov, Metadata metadata, XHTMLContentHandler xhtml) {
 
-                    // TODO Check for other kinds too
+        // Get some more information from the track header
+        // TODO Decide how to handle multiple tracks
+        List<TrackBox> tb = moov.getBoxes(TrackBox.class);
+        if (tb == null || tb.size() == 0) {
+            return;
+        }
+        TrackBox track = tb.get(0);
+
+        TrackHeaderBox header = track.getTrackHeaderBox();
+        // Get the creation and modification dates
+        metadata.set(TikaCoreProperties.CREATED, header.getCreationTime());
+        metadata.set(TikaCoreProperties.MODIFIED, header.getModificationTime());
+
+        // Get the video width and height
+        metadata.set(Metadata.IMAGE_WIDTH, (int) header.getWidth());
+        metadata.set(Metadata.IMAGE_LENGTH, (int) header.getHeight());
+
+        // Get the sample information
+        SampleTableBox samples = track.getSampleTableBox();
+        if (samples != null) {
+            SampleDescriptionBox sampleDesc = samples.getSampleDescriptionBox();
+            if (sampleDesc != null) {
+                // Look for the first Audio Sample, if present
+                AudioSampleEntry sample = getOrNull(sampleDesc, AudioSampleEntry.class);
+                if (sample != null) {
+                    XMPDM.ChannelTypePropertyConverter.convertAndSet(metadata, sample.getChannelCount());
+                    //metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, sample.getSampleSize());    // TODO Num -> Type mapping
+                    metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int) sample.getSampleRate());
+                    //metadata.set(XMPDM.AUDIO_, sample.getSamplesPerPacket());
+                    //metadata.set(XMPDM.AUDIO_, sample.getBytesPerSample());
                 }
+            }
+        }
+    }
 
+    private void handleMovieHeaderBox(MovieBox moov, Metadata metadata, XHTMLContentHandler xhtml) {
+        // Pull out some information from the header box
+        MovieHeaderBox mHeader = getOrNull(moov, MovieHeaderBox.class);
+        if (mHeader == null) {
+            return;
+        }
+        // Get the creation and modification dates
+        metadata.set(TikaCoreProperties.CREATED, mHeader.getCreationTime());
+        metadata.set(TikaCoreProperties.MODIFIED, mHeader.getModificationTime());
+
+        // Get the duration
+        double durationSeconds = ((double) mHeader.getDuration()) / mHeader.getTimescale();
+        metadata.set(XMPDM.DURATION, DURATION_FORMAT.format(durationSeconds));
+
+        // The timescale is normally the sampling rate
+        metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int) mHeader.getTimescale());
+    }
+
+    private void handleApple(MetaBox metaBox, Metadata metadata, XHTMLContentHandler xhtml) throws SAXException {
+        AppleItemListBox apple = getOrNull(metaBox, AppleItemListBox.class);
+        if (apple == null) {
+            return;
+        }
+        // Title
+        AppleNameBox title = getOrNull(apple, AppleNameBox.class);
+        addMetadata(TikaCoreProperties.TITLE, metadata, title);
+
+        // Artist
+        AppleArtistBox artist = getOrNull(apple, AppleArtistBox.class);
+        addMetadata(TikaCoreProperties.CREATOR, metadata, artist);
+        addMetadata(XMPDM.ARTIST, metadata, artist);
+
+        // Album Artist
+        AppleArtist2Box artist2 = getOrNull(apple, AppleArtist2Box.class);
+        addMetadata(XMPDM.ALBUM_ARTIST, metadata, artist2);
+
+        // Album
+        AppleAlbumBox album = getOrNull(apple, AppleAlbumBox.class);
+        addMetadata(XMPDM.ALBUM, metadata, album);
+
+        // Composer
+        AppleTrackAuthorBox composer = getOrNull(apple, AppleTrackAuthorBox.class);
+        addMetadata(XMPDM.COMPOSER, metadata, composer);
+
+        // Genre
+        AppleGenreBox genre = getOrNull(apple, AppleGenreBox.class);
+        addMetadata(XMPDM.GENRE, metadata, genre);
+
+        // Year
+        AppleRecordingYear2Box year = getOrNull(apple, AppleRecordingYear2Box.class);
+        if (year != null) {
+            metadata.set(XMPDM.RELEASE_DATE, year.getValue());
+        }
+
+        // Track number
+        AppleTrackNumberBox trackNum = getOrNull(apple, AppleTrackNumberBox.class);
+        if (trackNum != null) {
+            metadata.set(XMPDM.TRACK_NUMBER, trackNum.getA());
+            //metadata.set(XMPDM.NUMBER_OF_TRACKS, trackNum.getB()); // TODO
+        }
+
+        // Disc number
+        AppleDiskNumberBox discNum = getOrNull(apple, AppleDiskNumberBox.class);
+        if (discNum != null) {
+            metadata.set(XMPDM.DISC_NUMBER, discNum.getA());
+        }
+
+        // Compilation
+        AppleCompilationBox compilation = getOrNull(apple, AppleCompilationBox.class);
+        if (compilation != null) {
+            metadata.set(XMPDM.COMPILATION, (int) compilation.getValue());
+        }
+
+        // Comment
+        AppleCommentBox comment = getOrNull(apple, AppleCommentBox.class);
+        addMetadata(XMPDM.LOG_COMMENT, metadata, comment);
+
+        // Encoder
+        AppleEncoderBox encoder = getOrNull(apple, AppleEncoderBox.class);
+        if (encoder != null) {
+            metadata.set(XMP.CREATOR_TOOL, encoder.getValue());
+        }
 
-                // All done
-                xhtml.endDocument();
+
+        // As text
+        for (Box box : apple.getBoxes()) {
+            if (box instanceof Utf8AppleDataBox) {
+                xhtml.element("p", ((Utf8AppleDataBox) box).getValue());
             }
-        } finally {
-            tmp.dispose();
         }
 
     }
 
+    /**
+     * Override the maximum record size limit.  NOTE: this
+     * sets a static variable on the IsoFile and affects all files
+     * parsed in this JVM!!!
+     *
+     * @param maxRecordSize the new limit, stored in the static IsoFile.MAX_RECORD_SIZE_OVERRIDE
+     */
+    @Field
+    public void setMaxRecordSize(long maxRecordSize) {
+        IsoFile.MAX_RECORD_SIZE_OVERRIDE = maxRecordSize;
+    }
+
     private void extractGPS(UserDataBox userData, Metadata metadata) {
         AppleGPSCoordinatesBox coordBox = getOrNull(userData, AppleGPSCoordinatesBox.class);
         if (coordBox == null) {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/mp4/MP4ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/mp4/MP4ParserTest.java
index 531733f..12eb1a9 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/mp4/MP4ParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/mp4/MP4ParserTest.java
@@ -19,6 +19,7 @@ package org.apache.tika.parser.mp4;
 import static org.junit.Assert.assertEquals;
 
 import java.io.InputStream;
+import java.nio.file.Paths;
 
 import org.apache.tika.TikaTest;
 import org.apache.tika.io.TikaInputStream;


[tika] 02/02: TIKA-3090 -- extract doc security from ooxml

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit ca6bf65b26ab52c19208457510e16fd5db1ba440
Author: tallison <ta...@apache.org>
AuthorDate: Mon Apr 13 09:33:19 2020 -0400

    TIKA-3090 -- extract doc security from ooxml
    
    # Conflicts:
    #	tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLExtended.java
---
 .../tika/metadata/OfficeOpenXMLExtended.java       |  18 ++++++++++++++--
 .../src/test/java/org/apache/tika/TikaTest.java    |  16 ++++++++++++++
 .../parser/microsoft/ooxml/MetadataExtractor.java  |  24 ++++++++++++++++++++-
 .../parser/microsoft/ooxml/OOXMLParserTest.java    |  11 ++++++++++
 .../test-documents/testWORD_docSecurity.docx       | Bin 0 -> 12861 bytes
 5 files changed, 66 insertions(+), 3 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLExtended.java b/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLExtended.java
index 5829339..da1f484 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLExtended.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLExtended.java
@@ -34,6 +34,12 @@ public interface OfficeOpenXMLExtended
     String WORD_PROCESSING_NAMESPACE_URI = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
     String PREFIX = "extended-properties";
     String WORD_PROCESSING_PREFIX = "w";
+    String SECURITY_NONE = "None";
+    String SECURITY_PASSWORD_PROTECTED = "PasswordProtected";
+    String SECURITY_READ_ONLY_RECOMMENDED = "ReadOnlyRecommended";
+    String SECURITY_READ_ONLY_ENFORCED = "ReadOnlyEnforced";
+    String SECURITY_LOCKED_FOR_ANNOTATIONS = "LockedForAnnotations";
+    String SECURITY_UNKNOWN = "Unknown";
 
     Property TEMPLATE = Property.externalText(
     		PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + "Template");
@@ -60,10 +66,18 @@ public interface OfficeOpenXMLExtended
     
     Property APP_VERSION = Property.externalText(
     		PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + "AppVersion");
-    
+
+    //Integer flag
     Property DOC_SECURITY = Property.externalInteger(
     		PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + "DocSecurity");
-    
+
+    //Human-readable string explaining doc security flag
+    Property DOC_SECURITY_STRING = Property.externalClosedChoise(
+            PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER +
+            "DocSecurityString", SECURITY_NONE, SECURITY_PASSWORD_PROTECTED,
+            SECURITY_READ_ONLY_RECOMMENDED, SECURITY_READ_ONLY_ENFORCED,
+            SECURITY_LOCKED_FOR_ANNOTATIONS, SECURITY_UNKNOWN);
+
     Property COMMENTS = Property.externalTextBag(
             WORD_PROCESSING_PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + "comments");
 }
\ No newline at end of file
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index efb93b7..5c50ea3 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -29,6 +29,7 @@ import java.io.InputStream;
 import java.net.URISyntaxException;
 import java.net.URL;
 import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
@@ -490,4 +491,19 @@ public abstract class TikaTest {
         return null;
     }
 
+    public List<Path> getAllTestFiles() {
+        //for now, just get main files
+        //TODO: fix this to be recursive
+        try {
+            File[] pathArray = Paths.get(this.getClass().getResource("/test-documents")
+                    .toURI()).toFile().listFiles();
+            List<Path> paths = new ArrayList<>();
+            for (File f : pathArray) {
+                paths.add(f.toPath());
+            }
+            return paths;
+        } catch (URISyntaxException e) {
+            throw new RuntimeException(e);
+        }
+    }
 }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
index e5da8ce..9fb8224 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
@@ -137,7 +137,10 @@ public class MetadataExtractor {
         setProperty(metadata, OfficeOpenXMLExtended.PRESENTATION_FORMAT, propsHolder.getPresentationFormat());
         setProperty(metadata, OfficeOpenXMLExtended.TEMPLATE, propsHolder.getTemplate());
         setProperty(metadata, OfficeOpenXMLExtended.TOTAL_TIME, totalTime);
-
+        int docSecurityFlag = propsHolder.getDocSecurity();
+        setProperty(metadata, OfficeOpenXMLExtended.DOC_SECURITY, docSecurityFlag);
+        setProperty(metadata, OfficeOpenXMLExtended.DOC_SECURITY_STRING,
+                getDocSecurityString(docSecurityFlag));
         if (propsHolder.getPages() > 0) {
             metadata.set(PagedText.N_PAGES, propsHolder.getPages());
         } else if (propsHolder.getSlides() > 0) {
@@ -171,6 +174,25 @@ public class MetadataExtractor {
         setProperty(metadata, MSOffice.CHARACTER_COUNT_WITH_SPACES, propsHolder.getCharactersWithSpaces());
     }
 
+    private String getDocSecurityString(int docSecurityFlag) {
+        //mappings from: https://exiftool.org/TagNames/OOXML.html and
+        //https://docs.microsoft.com/en-us/dotnet/api/documentformat.openxml.extendedproperties.documentsecurity?view=openxml-2.8.1
+        switch(docSecurityFlag) {
+            case 0:
+                return OfficeOpenXMLExtended.SECURITY_NONE;
+            case 1:
+                return OfficeOpenXMLExtended.SECURITY_PASSWORD_PROTECTED;
+            case 2:
+                return OfficeOpenXMLExtended.SECURITY_READ_ONLY_RECOMMENDED;
+            case 4:
+                return OfficeOpenXMLExtended.SECURITY_READ_ONLY_ENFORCED;
+            case 8:
+                return OfficeOpenXMLExtended.SECURITY_LOCKED_FOR_ANNOTATIONS;
+            default:
+                return OfficeOpenXMLExtended.SECURITY_UNKNOWN;
+        }
+    }
+
     private void extractMetadata(POIXMLProperties.CustomProperties properties,
                                  Metadata metadata) {
         org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperties
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index b48ddae..bdbc9e4 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -31,6 +31,7 @@ import java.io.File;
 import java.io.InputStream;
 import java.io.PrintStream;
 import java.io.StringWriter;
+import java.nio.file.Path;
 import java.text.DecimalFormatSymbols;
 import java.util.Arrays;
 import java.util.HashMap;
@@ -1786,6 +1787,16 @@ public class OOXMLParserTest extends TikaTest {
         assertContains("2018-09-20", xml);
         assertContains("1996-08-10", xml);
     }
+
+    @Test
+    public void testDocSecurity() throws Exception {
+        assertEquals(OfficeOpenXMLExtended.SECURITY_PASSWORD_PROTECTED,
+                getRecursiveMetadata("protectedFile.xlsx")
+                .get(0).get(OfficeOpenXMLExtended.DOC_SECURITY_STRING));
+        assertEquals(OfficeOpenXMLExtended.SECURITY_READ_ONLY_ENFORCED,
+                getRecursiveMetadata("testWORD_docSecurity.docx")
+                        .get(0).get(OfficeOpenXMLExtended.DOC_SECURITY_STRING));
+    }
 }
 
 
diff --git a/tika-parsers/src/test/resources/test-documents/testWORD_docSecurity.docx b/tika-parsers/src/test/resources/test-documents/testWORD_docSecurity.docx
new file mode 100644
index 0000000..14a8196
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testWORD_docSecurity.docx differ