You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2015/12/29 00:22:47 UTC
svn commit: r1722029 [3/4] - in /tika/branches/2.x: tika-parser-modules/
tika-parser-modules/tika-multimedia-module/
tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/module/
tika-parser-modules/tika-multimedia-module/src/main/ja...
Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,246 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TailStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.XMPDM;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.mp3.ID3Tags.ID3Comment;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * The <code>Mp3Parser</code> extracts metadata and a short XHTML summary from
+ * MP3 files. It reads ID3v2 tags (major versions 2, 3 and 4) from the start of
+ * the stream, scans the MPEG audio frames to accumulate the duration and pick
+ * up basic audio properties, and finally looks for lyrics and ID3v1 tags in
+ * the tail of the stream.
+ *
+ * @see <a href="http://www.id3.org/ID3v1">MP3 ID3 Version 1 specification</a>
+ * @see <a href="http://www.id3.org/id3v2.4.0-structure">MP3 ID3 Version 2.4 Structure Specification</a>
+ * @see <a href="http://www.id3.org/id3v2.4.0-frames">MP3 ID3 Version 2.4 Frames Specification</a>
+ */
+public class Mp3Parser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = 8537074922934844370L;
+
+    /** The only media type handled by this parser: <code>audio/mpeg</code>. */
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MediaType.audio("mpeg"));
+
+    /**
+     * Returns the media types supported by this parser.
+     *
+     * @param context the parse context (not used)
+     * @return a singleton set containing <code>audio/mpeg</code>
+     */
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Parses the given MP3 stream. Tag values are copied into the metadata
+     * object and a small XHTML document (title, artist, album line, year,
+     * genre, duration, comments and lyrics) is written to the handler.
+     *
+     * @param stream the MP3 data
+     * @param handler receives the XHTML SAX events
+     * @param metadata receives the extracted metadata
+     * @param context the parse context (not used)
+     * @throws IOException if the stream cannot be read
+     * @throws SAXException if the handler fails to process an event
+     * @throws TikaException if the tags cannot be parsed
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        metadata.set(Metadata.CONTENT_TYPE, "audio/mpeg");
+        metadata.set(XMPDM.AUDIO_COMPRESSOR, "MP3");
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        // Create handlers for the various kinds of ID3 tags
+        ID3TagsAndAudio audioAndTags = getAllTagHandlers(stream, handler);
+
+        // Process tags metadata if the file has supported tags
+        if (audioAndTags.tags.length > 0) {
+            CompositeTagHandler tag = new CompositeTagHandler(audioAndTags.tags);
+
+            metadata.set(TikaCoreProperties.TITLE, tag.getTitle());
+            metadata.set(TikaCoreProperties.CREATOR, tag.getArtist());
+            metadata.set(XMPDM.ARTIST, tag.getArtist());
+            metadata.set(XMPDM.ALBUM_ARTIST, tag.getAlbumArtist());
+            metadata.set(XMPDM.COMPOSER, tag.getComposer());
+            metadata.set(XMPDM.ALBUM, tag.getAlbum());
+            metadata.set(XMPDM.COMPILATION, tag.getCompilation());
+            metadata.set(XMPDM.RELEASE_DATE, tag.getYear());
+            metadata.set(XMPDM.GENRE, tag.getGenre());
+
+            // Render every comment as "<language> - <description>\n<text>",
+            // leaving out whichever parts are missing.
+            List<String> comments = new ArrayList<String>();
+            for (ID3Comment comment : tag.getComments()) {
+                // Purely local buffer, so the unsynchronized builder suffices
+                StringBuilder cmt = new StringBuilder();
+                if (comment.getLanguage() != null) {
+                    cmt.append(comment.getLanguage());
+                    cmt.append(" - ");
+                }
+                if (comment.getDescription() != null) {
+                    cmt.append(comment.getDescription());
+                    if (comment.getText() != null) {
+                        cmt.append("\n");
+                    }
+                }
+                if (comment.getText() != null) {
+                    cmt.append(comment.getText());
+                }
+
+                String commentText = cmt.toString();
+                comments.add(commentText);
+                metadata.add(XMPDM.LOG_COMMENT.getName(), commentText);
+            }
+
+            xhtml.element("h1", tag.getTitle());
+            xhtml.element("p", tag.getArtist());
+
+            // Album line, extended by track (ID3v1.1 addition) and disc.
+            // The album is only appended when present: appending a null
+            // reference to a StringBuilder would emit the literal text "null".
+            StringBuilder albumLine = new StringBuilder();
+            if (tag.getAlbum() != null) {
+                albumLine.append(tag.getAlbum());
+            }
+            if (tag.getTrackNumber() != null) {
+                if (albumLine.length() > 0) {
+                    albumLine.append(", ");
+                }
+                albumLine.append("track ").append(tag.getTrackNumber());
+                metadata.set(XMPDM.TRACK_NUMBER, tag.getTrackNumber());
+            }
+            if (tag.getDisc() != null) {
+                if (albumLine.length() > 0) {
+                    albumLine.append(", ");
+                }
+                albumLine.append("disc ").append(tag.getDisc());
+                metadata.set(XMPDM.DISC_NUMBER, tag.getDisc());
+            }
+            xhtml.element("p", albumLine.toString());
+
+            xhtml.element("p", tag.getYear());
+            xhtml.element("p", tag.getGenre());
+            xhtml.element("p", String.valueOf(audioAndTags.duration));
+            for (String comment : comments) {
+                xhtml.element("p", comment);
+            }
+        }
+        if (audioAndTags.duration > 0) {
+            metadata.set(XMPDM.DURATION, audioAndTags.duration);
+        }
+        if (audioAndTags.audio != null) {
+            AudioFrame audio = audioAndTags.audio;
+            metadata.set("samplerate", String.valueOf(audio.getSampleRate()));
+            metadata.set("channels", String.valueOf(audio.getChannels()));
+            metadata.set("version", audio.getVersion());
+
+            metadata.set(
+                    XMPDM.AUDIO_SAMPLE_RATE,
+                    Integer.toString(audio.getSampleRate()));
+            // Translate the channel count into the XMP channel type label;
+            // other counts leave the property unset.
+            if (audio.getChannels() == 1) {
+                metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "Mono");
+            } else if (audio.getChannels() == 2) {
+                metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "Stereo");
+            } else if (audio.getChannels() == 5) {
+                metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "5.1");
+            } else if (audio.getChannels() == 7) {
+                metadata.set(XMPDM.AUDIO_CHANNEL_TYPE, "7.1");
+            }
+        }
+        if (audioAndTags.lyrics != null && audioAndTags.lyrics.hasLyrics()) {
+            xhtml.startElement("p", "class", "lyrics");
+            xhtml.characters(audioAndTags.lyrics.lyricsText);
+            xhtml.endElement("p");
+        }
+
+        xhtml.endDocument();
+    }
+
+    /**
+     * Scans the MP3 frames for ID3 tags, and creates ID3Tag Handlers
+     * for each supported set of tags. While doing so the durations of all
+     * audio frames are summed up and the first audio frame is remembered.
+     *
+     * @param stream the MP3 data; it is read to its end
+     * @param handler the content handler (not used by this method)
+     * @return the tag handlers (newest tag version first), the first audio
+     *         frame (may be <b>null</b>), the lyrics handler and the
+     *         accumulated duration
+     * @throws IOException if the stream cannot be read
+     * @throws SAXException declared for the tag handlers
+     * @throws TikaException declared for the tag handlers
+     */
+    protected static ID3TagsAndAudio getAllTagHandlers(InputStream stream, ContentHandler handler)
+            throws IOException, SAXException, TikaException {
+        ID3v24Handler v24 = null;
+        ID3v23Handler v23 = null;
+        ID3v22Handler v22 = null;
+        ID3v1Handler v1 = null;
+        LyricsHandler lyrics = null;
+        AudioFrame firstAudio = null;
+
+        // Keep the last bytes of the stream around: the ID3v1 tag and the
+        // lyrics are searched for in this tail once everything has been read.
+        TailStream tailStream = new TailStream(stream, 10240+128);
+        MpegStream mpegStream = new MpegStream(tailStream);
+
+        // ID3v2 tags live at the start of the file
+        // You can apparently have several different ID3 tag blocks
+        // So, keep going until we don't find any more
+        MP3Frame f;
+        while ((f = ID3v2Frame.createFrameIfPresent(mpegStream)) != null) {
+            if (f instanceof ID3v2Frame) {
+                ID3v2Frame id3F = (ID3v2Frame) f;
+                if (id3F.getMajorVersion() == 4) {
+                    v24 = new ID3v24Handler(id3F);
+                } else if (id3F.getMajorVersion() == 3) {
+                    v23 = new ID3v23Handler(id3F);
+                } else if (id3F.getMajorVersion() == 2) {
+                    v22 = new ID3v22Handler(id3F);
+                }
+            }
+        }
+
+        // Now iterate over all audio frames in the file
+        AudioFrame frame = mpegStream.nextFrame();
+        float duration = 0;
+        while (frame != null) {
+            duration += frame.getDuration();
+            if (firstAudio == null) {
+                firstAudio = frame;
+            }
+            mpegStream.skipFrame();
+            frame = mpegStream.nextFrame();
+        }
+
+        // ID3v1 tags live at the end of the file
+        // Lyrics live just before ID3v1, at the end of the file
+        // Search for both (handlers seek to the end for us)
+        lyrics = new LyricsHandler(tailStream.getTail());
+        v1 = lyrics.id3v1;
+
+        // Go in order of preference
+        // Currently, that's newest to oldest
+        List<ID3Tags> tags = new ArrayList<ID3Tags>();
+
+        if (v24 != null && v24.getTagsPresent()) {
+            tags.add(v24);
+        }
+        if (v23 != null && v23.getTagsPresent()) {
+            tags.add(v23);
+        }
+        if (v22 != null && v22.getTagsPresent()) {
+            tags.add(v22);
+        }
+        if (v1 != null && v1.getTagsPresent()) {
+            tags.add(v1);
+        }
+
+        ID3TagsAndAudio ret = new ID3TagsAndAudio();
+        ret.audio = firstAudio;
+        ret.lyrics = lyrics;
+        ret.tags = tags.toArray(new ID3Tags[tags.size()]);
+        ret.duration = duration;
+        return ret;
+    }
+
+    /**
+     * Simple holder for everything {@code getAllTagHandlers} collects from
+     * an MP3 stream.
+     */
+    protected static class ID3TagsAndAudio {
+        /** Tag handlers that reported tags, in order of preference. */
+        private ID3Tags[] tags;
+        /** The first audio frame found, or <b>null</b> if there was none. */
+        private AudioFrame audio;
+        /** Handler for the lyrics found in the tail of the stream. */
+        private LyricsHandler lyrics;
+        /** Sum of the durations of all audio frames. */
+        private float duration;
+    }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/MpegStream.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/MpegStream.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/MpegStream.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/MpegStream.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,469 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PushbackInputStream;
+
+/**
+ * <p>
+ * A specialized stream class which can be used to extract single frames of MPEG
+ * audio files.
+ * </p>
+ * <p>
+ * Instances of this class are constructed with an underlying stream which
+ * should point to an audio file. Read operations are possible in the usual way.
+ * However, there are special methods for searching and extracting headers of
+ * MPEG frames. Some meta information of frames can be queried.
+ * </p>
+ */
+class MpegStream extends PushbackInputStream
+{
+    /** Bit rate table for MPEG V1, layer 1. */
+    private static final int[] BIT_RATE_MPEG1_L1 = {
+            0, 32000, 64000, 96000, 128000, 160000, 192000, 224000, 256000,
+            288000, 320000, 352000, 384000, 416000, 448000
+    };
+
+    /** Bit rate table for MPEG V1, layer 2. */
+    private static final int[] BIT_RATE_MPEG1_L2 = {
+            0, 32000, 48000, 56000, 64000, 80000, 96000, 112000, 128000,
+            160000, 192000, 224000, 256000, 320000, 384000
+    };
+
+    /** Bit rate table for MPEG V1, layer 3. */
+    private static final int[] BIT_RATE_MPEG1_L3 = {
+            0, 32000, 40000, 48000, 56000, 64000, 80000, 96000, 112000, 128000,
+            160000, 192000, 224000, 256000, 320000
+    };
+
+    /** Bit rate table for MPEG V2/V2.5, layer 1. */
+    private static final int[] BIT_RATE_MPEG2_L1 = {
+            0, 32000, 48000, 56000, 64000, 80000, 96000, 112000, 128000,
+            144000, 160000, 176000, 192000, 224000, 256000
+    };
+
+    /** Bit rate table for MPEG V2/V2.5, layer 2 and 3. */
+    private static final int[] BIT_RATE_MPEG2_L2 = {
+            0, 8000, 16000, 24000, 32000, 40000, 48000, 56000, 64000, 80000,
+            96000, 112000, 128000, 144000, 160000
+    };
+
+    /** Sample rate table for MPEG V1. */
+    private static final int[] SAMPLE_RATE_MPEG1 = {
+            44100, 48000, 32000
+    };
+
+    /** Sample rate table for MPEG V2. */
+    private static final int[] SAMPLE_RATE_MPEG2 = {
+            22050, 24000, 16000
+    };
+
+    /** Sample rate table for MPEG V2.5. */
+    private static final int[] SAMPLE_RATE_MPEG2_5 = {
+            11025, 12000, 8000
+    };
+
+    /** Sample rate table for all MPEG versions. */
+    private static final int[][] SAMPLE_RATE = createSampleRateTable();
+
+    /** Constant for the number of samples for a layer 1 frame. */
+    private static final int SAMPLE_COUNT_L1 = 384;
+
+    /** Constant for the number of samples for a layer 2 or 3 frame. */
+    private static final int SAMPLE_COUNT_L2 = 1152;
+
+    /** Constant for the size of an MPEG frame header in bytes. */
+    private static final int HEADER_SIZE = 4;
+
+    /** The current MPEG header. */
+    private AudioFrame currentHeader;
+
+    /** A flag whether the end of the stream is reached. */
+    private boolean endOfStream;
+
+    /**
+     * Creates a new instance of {@code MpegStream} and initializes it with the
+     * underlying stream. The push back buffer is sized to hold two frame
+     * headers.
+     *
+     * @param in the underlying audio stream
+     */
+    public MpegStream(InputStream in)
+    {
+        super(in, 2 * HEADER_SIZE);
+    }
+
+    /**
+     * Searches for the next MPEG frame header from the current stream position
+     * on. This method advances the underlying input stream until it finds a
+     * valid frame header or the end of the stream is reached. In the former
+     * case a corresponding {@code AudioFrame} object is created. In the latter
+     * case there are no more headers, so the end of the stream is probably
+     * reached.
+     *
+     * @return the next {@code AudioFrame} or <b>null</b>
+     * @throws IOException if an IO error occurs
+     */
+    public AudioFrame nextFrame() throws IOException
+    {
+        AudioFrame frame = null;
+        while (!endOfStream && frame == null)
+        {
+            findFrameSyncByte();
+            if (!endOfStream)
+            {
+                HeaderBitField headerField = createHeaderField();
+                if (!endOfStream)
+                {
+                    frame = createHeader(headerField);
+                    if (frame == null)
+                    {
+                        // Not a header after all: re-read the three bytes
+                        // following the sync byte, one of them may itself
+                        // start a frame.
+                        pushBack(headerField);
+                    }
+                }
+            }
+        }
+
+        currentHeader = frame;
+        return frame;
+    }
+
+    /**
+     * Skips the current MPEG frame. This method can be called after a valid
+     * MPEG header has been retrieved using {@code nextFrame()}. In this case
+     * the underlying stream is advanced to the end of the associated MPEG
+     * frame. Otherwise, this method has no effect. The return value indicates
+     * whether a frame could be skipped.
+     *
+     * @return <b>true</b> if a frame could be skipped, <b>false</b> otherwise
+     * @throws IOException if an IO error occurs
+     */
+    public boolean skipFrame() throws IOException
+    {
+        if (currentHeader != null)
+        {
+            // The header bytes have been consumed already, so only the
+            // remainder of the frame is skipped.
+            skipStream(in, currentHeader.getLength() - HEADER_SIZE);
+            currentHeader = null;
+            return true;
+        }
+        return false;
+    }
+
+    /**
+     * Advances the underlying stream until the first byte of frame sync is
+     * found.
+     *
+     * @throws IOException if an error occurs
+     */
+    private void findFrameSyncByte() throws IOException
+    {
+        boolean found = false;
+        while (!found && !endOfStream)
+        {
+            if (nextByte() == 0xFF)
+            {
+                found = true;
+            }
+        }
+    }
+
+    /**
+     * Creates a bit field for the MPEG frame header. The field is filled with
+     * the three bytes that follow the sync byte.
+     *
+     * @return the bit field
+     * @throws IOException if an error occurs
+     */
+    private HeaderBitField createHeaderField() throws IOException
+    {
+        HeaderBitField field = new HeaderBitField();
+        field.add(nextByte());
+        field.add(nextByte());
+        field.add(nextByte());
+        return field;
+    }
+
+    /**
+     * Creates an {@code AudioFrame} object based on the given header field. If
+     * the header field contains invalid values, result is <b>null</b>.
+     *
+     * @param bits the header bit field
+     * @return the {@code AudioFrame}
+     */
+    private AudioFrame createHeader(HeaderBitField bits)
+    {
+        // The three topmost bits continue the frame sync and must all be set
+        if (bits.get(21, 23) != 7)
+        {
+            return null;
+        }
+
+        int mpegVer = bits.get(19, 20);
+        int layer = bits.get(17, 18);
+        int bitRateCode = bits.get(12, 15);
+        int sampleRateCode = bits.get(10, 11);
+        int padding = bits.get(9);
+
+        if (mpegVer == 1 || layer == 0 || bitRateCode == 0 || bitRateCode == 15
+                || sampleRateCode == 3)
+        {
+            // invalid header values
+            return null;
+        }
+
+        int bitRate = calculateBitRate(mpegVer, layer, bitRateCode);
+        int sampleRate = calculateSampleRate(mpegVer, sampleRateCode);
+        int length = calculateFrameLength(layer, bitRate, sampleRate, padding);
+        float duration = calculateDuration(layer, sampleRate);
+        int channels = calculateChannels(bits.get(6, 7));
+        return new AudioFrame(mpegVer, layer, bitRate, sampleRate, channels,
+                length, duration);
+    }
+
+    /**
+     * Reads the next byte. Once the end of the stream has been hit, the
+     * {@code endOfStream} flag is set and 0 is returned.
+     *
+     * @return the next byte
+     * @throws IOException if an error occurs
+     */
+    private int nextByte() throws IOException
+    {
+        int result = 0;
+        if (!endOfStream)
+        {
+            result = read();
+            if (result == -1)
+            {
+                endOfStream = true;
+            }
+        }
+        return endOfStream ? 0 : result;
+    }
+
+    /**
+     * Pushes the given header field back in the stream so that the bytes are
+     * read again. This method is called if an invalid header was detected. Then
+     * search has to continue at the next byte after the frame sync byte.
+     *
+     * @param field the header bit field with the invalid frame header
+     * @throws IOException if an error occurs
+     */
+    private void pushBack(HeaderBitField field) throws IOException
+    {
+        unread(field.toArray());
+    }
+
+    /**
+     * Skips the given number of bytes from the specified input stream.
+     * Skipping stops early when the stream ends. A stream is allowed to skip
+     * fewer bytes than requested, possibly none at all, without being at its
+     * end; a skip of 0 bytes is therefore followed by a single byte read,
+     * which either makes progress or reveals the end of the stream. This
+     * keeps the loop from spinning forever on streams that report the end of
+     * the data by skipping 0 bytes.
+     *
+     * @param in the input stream
+     * @param count the number of bytes to skip
+     * @throws IOException if an IO error occurs
+     */
+    private static void skipStream(InputStream in, long count)
+            throws IOException
+    {
+        long remaining = count;
+        while (remaining > 0)
+        {
+            long skipped = in.skip(remaining);
+            if (skipped < 0)
+            {
+                // the stream signalled its end
+                break;
+            }
+            if (skipped == 0)
+            {
+                // no progress: probe with a read to tell "end of stream"
+                // apart from "nothing skipped this time"
+                if (in.read() < 0)
+                {
+                    break;
+                }
+                skipped = 1;
+            }
+            remaining -= skipped;
+        }
+    }
+
+    /**
+     * Calculates the bit rate based on the given parameters.
+     *
+     * @param mpegVer the MPEG version
+     * @param layer the layer
+     * @param code the code for the bit rate
+     * @return the bit rate in bits per second
+     */
+    private static int calculateBitRate(int mpegVer, int layer, int code)
+    {
+        int[] arr = null;
+
+        if (mpegVer == AudioFrame.MPEG_V1)
+        {
+            switch (layer)
+            {
+            case AudioFrame.LAYER_1:
+                arr = BIT_RATE_MPEG1_L1;
+                break;
+            case AudioFrame.LAYER_2:
+                arr = BIT_RATE_MPEG1_L2;
+                break;
+            case AudioFrame.LAYER_3:
+                arr = BIT_RATE_MPEG1_L3;
+                break;
+            }
+        }
+        else
+        {
+            if (layer == AudioFrame.LAYER_1)
+            {
+                arr = BIT_RATE_MPEG2_L1;
+            }
+            else
+            {
+                arr = BIT_RATE_MPEG2_L2;
+            }
+        }
+        return arr[code];
+    }
+
+    /**
+     * Calculates the sample rate based on the given parameters.
+     *
+     * @param mpegVer the MPEG version
+     * @param code the code for the sample rate
+     * @return the sample rate in samples per second
+     */
+    private static int calculateSampleRate(int mpegVer, int code)
+    {
+        return SAMPLE_RATE[mpegVer][code];
+    }
+
+    /**
+     * Calculates the length of an MPEG frame based on the given parameters.
+     *
+     * @param layer the layer
+     * @param bitRate the bit rate
+     * @param sampleRate the sample rate
+     * @param padding the padding flag
+     * @return the length of the frame in bytes
+     */
+    private static int calculateFrameLength(int layer, int bitRate,
+            int sampleRate, int padding)
+    {
+        // NOTE(review): the same formulas (and the sample counts used by
+        // calculateDuration) are applied to every MPEG version. MPEG V2/V2.5
+        // layer 3 frames are commonly described as carrying 576 samples -
+        // confirm whether that case needs separate handling.
+        if (layer == AudioFrame.LAYER_1)
+        {
+            return (12 * bitRate / sampleRate + padding) * 4;
+        }
+        else
+        {
+            return 144 * bitRate / sampleRate + padding;
+        }
+    }
+
+    /**
+     * Calculates the duration of a MPEG frame based on the given parameters.
+     *
+     * @param layer the layer
+     * @param sampleRate the sample rate
+     * @return the duration of this frame in milliseconds
+     */
+    private static float calculateDuration(int layer, int sampleRate)
+    {
+        int sampleCount =
+                (layer == AudioFrame.LAYER_1) ? SAMPLE_COUNT_L1
+                        : SAMPLE_COUNT_L2;
+        return (1000.0f / sampleRate) * sampleCount;
+    }
+
+    /**
+     * Calculates the number of channels based on the given parameters.
+     *
+     * @param chan the code for the channels
+     * @return the number of channels: 1 for code 3, otherwise 2
+     */
+    private static int calculateChannels(int chan)
+    {
+        return chan < 3 ? 2 : 1;
+    }
+
+    /**
+     * Creates the complete array for the sample rate mapping. The table is
+     * indexed by the MPEG version constants of {@code AudioFrame}; the slot
+     * of any index not assigned here stays <b>null</b>.
+     *
+     * @return the table for the sample rates
+     */
+    private static int[][] createSampleRateTable()
+    {
+        int[][] arr = new int[4][];
+        arr[AudioFrame.MPEG_V1] = SAMPLE_RATE_MPEG1;
+        arr[AudioFrame.MPEG_V2] = SAMPLE_RATE_MPEG2;
+        arr[AudioFrame.MPEG_V2_5] = SAMPLE_RATE_MPEG2_5;
+        return arr;
+    }
+
+    /**
+     * A class representing the bit field of an MPEG header. It allows
+     * convenient access to specific bit groups.
+     */
+    private static class HeaderBitField
+    {
+        /** The internal value. */
+        private int value;
+
+        /**
+         * Adds a byte to this field. The bytes added before are shifted up by
+         * 8 bits, so the byte added last occupies the lowest 8 bits.
+         *
+         * @param b the byte to be added
+         */
+        public void add(int b)
+        {
+            value <<= 8;
+            value |= b;
+        }
+
+        /**
+         * Returns the value of the bit group from the given start and end
+         * index. E.g. ''from'' = 0, ''to'' = 3 will return the value of the
+         * first 4 bits. Bit 0 is the least significant bit.
+         *
+         * @param from the from index
+         * @param to the to index
+         * @return the value of this group of bits
+         */
+        public int get(int from, int to)
+        {
+            int shiftVal = value >> from;
+            int mask = (1 << (to - from + 1)) - 1;
+            return shiftVal & mask;
+        }
+
+        /**
+         * Returns the value of the bit with the given index. The bit index is
+         * 0-based. Result is either 0 or 1, depending on the value of this bit.
+         *
+         * @param bit the bit index
+         * @return the value of this bit
+         */
+        public int get(int bit)
+        {
+            return get(bit, bit);
+        }
+
+        /**
+         * Returns the internal value of this field as an array. The array
+         * contains 3 bytes, starting with the byte that was added first.
+         *
+         * @return the internal value of this field as byte array
+         */
+        public byte[] toArray()
+        {
+            byte[] result = new byte[3];
+            result[0] = (byte) get(16, 23);
+            result[1] = (byte) get(8, 15);
+            result[2] = (byte) get(0, 7);
+            return result;
+        }
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp4/DirectFileReadDataSource.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp4/DirectFileReadDataSource.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp4/DirectFileReadDataSource.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp4/DirectFileReadDataSource.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp4;
+
+import com.googlecode.mp4parser.DataSource;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.nio.ByteBuffer;
+import java.nio.channels.WritableByteChannel;
+
+import static com.googlecode.mp4parser.util.CastUtils.l2i;
+
+/**
+ * A {@link DataSource} implementation that relies on direct reads from a {@link RandomAccessFile}.
+ * It should be slower than {@link com.googlecode.mp4parser.FileDataSourceImpl} but does not incur the implicit file locks of
+ * memory mapped I/O on some JVMs. This implementation allows for a more controlled deletion of files
+ * and might be preferred when working with temporary files.
+ * @see <a href="http://bugs.java.com/view_bug.do?bug_id=4724038">JDK-4724038 : (fs) Add unmap method to MappedByteBuffer</a>
+ * @see <a href="http://bugs.java.com/view_bug.do?bug_id=6359560">JDK-6359560 : (fs) File.deleteOnExit() doesn't work when MappedByteBuffer exists (win)</a>
+ */
+public class DirectFileReadDataSource implements DataSource {
+
+    /** Maximum number of bytes moved per read call in {@link #read(ByteBuffer)}. */
+    private static final int TRANSFER_SIZE = 8192;
+
+    /** The file all reads are served from; opened read-only. */
+    private final RandomAccessFile raf;
+
+    /**
+     * Opens the given file for reading.
+     *
+     * @param f the file to read from
+     * @throws IOException if the file cannot be opened
+     */
+    public DirectFileReadDataSource(File f) throws IOException {
+        this.raf = new RandomAccessFile(f, "r");
+    }
+
+    /**
+     * Fills the given buffer from the current file position, reading in
+     * chunks of at most {@code TRANSFER_SIZE} bytes until the buffer is full
+     * or the end of the file is reached.
+     *
+     * @param byteBuffer the buffer to fill
+     * @return the number of bytes read, or -1 if the end of the file was
+     *         reached before any byte could be read
+     * @throws IOException if an IO error occurs
+     */
+    public int read(ByteBuffer byteBuffer) throws IOException {
+        int len = byteBuffer.remaining();
+        int totalRead = 0;
+        int bytesRead = 0;
+        // No point in allocating a full transfer buffer for a small request
+        byte[] buf = new byte[Math.min(len, TRANSFER_SIZE)];
+        while (totalRead < len) {
+            int bytesToRead = Math.min((len - totalRead), TRANSFER_SIZE);
+            bytesRead = raf.read(buf, 0, bytesToRead);
+            if (bytesRead < 0) {
+                break;
+            }
+            totalRead += bytesRead;
+            byteBuffer.put(buf, 0, bytesRead);
+        }
+        return ((bytesRead < 0) && (totalRead == 0)) ? -1 : totalRead;
+    }
+
+    /**
+     * Tries to fill the given buffer with a single read call, so fewer bytes
+     * than the buffer has room for may be transferred.
+     *
+     * @param byteBuffer the buffer to fill
+     * @return the number of bytes read, or -1 if the end of the file has
+     *         been reached
+     * @throws IOException if an IO error occurs
+     */
+    public int readAllInOnce(ByteBuffer byteBuffer) throws IOException {
+        byte[] buf = new byte[byteBuffer.remaining()];
+        int read = raf.read(buf);
+        // At the end of the file read is -1; handing that to put() as a
+        // length would fail with an IndexOutOfBoundsException.
+        if (read > 0) {
+            byteBuffer.put(buf, 0, read);
+        }
+        return read;
+    }
+
+    /**
+     * @return the length of the underlying file in bytes
+     * @throws IOException if an IO error occurs
+     */
+    public long size() throws IOException {
+        return raf.length();
+    }
+
+    /**
+     * @return the current read position within the file
+     * @throws IOException if an IO error occurs
+     */
+    public long position() throws IOException {
+        return raf.getFilePointer();
+    }
+
+    /**
+     * Moves the read position.
+     *
+     * @param nuPos the new position, measured from the start of the file
+     * @throws IOException if an IO error occurs
+     */
+    public void position(long nuPos) throws IOException {
+        raf.seek(nuPos);
+    }
+
+    /**
+     * Writes a section of the file to the given channel. The section is
+     * loaded through {@link #map(long, long)}, so this call moves the read
+     * position as well.
+     *
+     * @param position the start of the section
+     * @param count the length of the section in bytes
+     * @param target the channel to write to
+     * @return the number of bytes written by the channel
+     * @throws IOException if an IO error occurs
+     */
+    public long transferTo(long position, long count, WritableByteChannel target) throws IOException {
+        return target.write(map(position, count));
+    }
+
+    /**
+     * Reads a section of the file completely into memory. Despite the name no
+     * memory mapping takes place: the bytes are copied into a heap array. The
+     * read position is left at the end of the section.
+     *
+     * @param startPosition the start of the section
+     * @param size the length of the section in bytes
+     * @return a buffer wrapping the bytes of the section
+     * @throws IOException if an IO error occurs, or if the file ends before
+     *         the section is complete
+     */
+    public ByteBuffer map(long startPosition, long size) throws IOException {
+        raf.seek(startPosition);
+        byte[] payload = new byte[l2i(size)];
+        raf.readFully(payload);
+        return ByteBuffer.wrap(payload);
+    }
+
+    /**
+     * Closes the underlying file.
+     *
+     * @throws IOException if an IO error occurs
+     */
+    public void close() throws IOException {
+        raf.close();
+    }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,325 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp4;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.text.DecimalFormat;
+import java.text.NumberFormat;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.XMP;
+import org.apache.tika.metadata.XMPDM;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import com.coremedia.iso.IsoFile;
+import com.coremedia.iso.boxes.Box;
+import com.coremedia.iso.boxes.Container;
+import com.coremedia.iso.boxes.FileTypeBox;
+import com.coremedia.iso.boxes.MetaBox;
+import com.coremedia.iso.boxes.MovieBox;
+import com.coremedia.iso.boxes.MovieHeaderBox;
+import com.coremedia.iso.boxes.SampleDescriptionBox;
+import com.coremedia.iso.boxes.SampleTableBox;
+import com.coremedia.iso.boxes.TrackBox;
+import com.coremedia.iso.boxes.TrackHeaderBox;
+import com.coremedia.iso.boxes.UserDataBox;
+import com.coremedia.iso.boxes.apple.AppleItemListBox;
+import com.coremedia.iso.boxes.sampleentry.AudioSampleEntry;
+import com.googlecode.mp4parser.boxes.apple.AppleAlbumBox;
+import com.googlecode.mp4parser.boxes.apple.AppleArtistBox;
+import com.googlecode.mp4parser.boxes.apple.AppleArtist2Box;
+import com.googlecode.mp4parser.boxes.apple.AppleCommentBox;
+import com.googlecode.mp4parser.boxes.apple.AppleCompilationBox;
+import com.googlecode.mp4parser.boxes.apple.AppleDiskNumberBox;
+import com.googlecode.mp4parser.boxes.apple.AppleEncoderBox;
+import com.googlecode.mp4parser.boxes.apple.AppleGenreBox;
+import com.googlecode.mp4parser.boxes.apple.AppleNameBox;
+import com.googlecode.mp4parser.boxes.apple.AppleRecordingYear2Box;
+import com.googlecode.mp4parser.boxes.apple.AppleTrackAuthorBox;
+import com.googlecode.mp4parser.boxes.apple.AppleTrackNumberBox;
+import com.googlecode.mp4parser.boxes.apple.Utf8AppleDataBox;
+
+/**
+ * Parser for the MP4 media container format, as well as the older
+ * QuickTime format that MP4 is based on.
+ *
+ * This uses the MP4Parser project from http://code.google.com/p/mp4parser/
+ * to do the underlying parsing
+ */
+public class MP4Parser extends AbstractParser {
+    /** Serial version UID */
+    private static final long serialVersionUID = 84011216792285L;
+    /**
+     * TODO Replace this with a 2dp Duration Property Converter.
+     * DecimalFormat is not thread-safe, so every use of this shared
+     * instance must synchronize on it.
+     */
+    private static final DecimalFormat DURATION_FORMAT =
+            (DecimalFormat)NumberFormat.getNumberInstance(Locale.ROOT);
+    static {
+        DURATION_FORMAT.applyPattern("0.0#");
+    }
+
+    // Ensure this stays in Sync with the entries in tika-mimetypes.xml
+    private static final Map<MediaType,List<String>> typesMap = new HashMap<MediaType, List<String>>();
+    static {
+        // All types should be 4 bytes long, space padded as needed
+        typesMap.put(MediaType.audio("mp4"), Arrays.asList(
+                "M4A ", "M4B ", "F4A ", "F4B "));
+        typesMap.put(MediaType.video("3gpp"), Arrays.asList(
+                "3ge6", "3ge7", "3gg6", "3gp1", "3gp2", "3gp3", "3gp4", "3gp5", "3gp6", "3gs7"));
+        typesMap.put(MediaType.video("3gpp2"), Arrays.asList(
+                "3g2a", "3g2b", "3g2c"));
+        typesMap.put(MediaType.video("mp4"), Arrays.asList(
+                "mp41", "mp42"));
+        typesMap.put(MediaType.video("x-m4v"), Arrays.asList(
+                "M4V ", "M4VH", "M4VP"));
+
+        typesMap.put(MediaType.video("quicktime"), Collections.<String>emptyList());
+        typesMap.put(MediaType.application("mp4"), Collections.<String>emptyList());
+    }
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.unmodifiableSet(typesMap.keySet());
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+
+    /**
+     * Extracts the content type, header metadata, first-track metadata and
+     * iTunes style metadata from an MP4 / QuickTime file. The stream is
+     * spooled to a file, as the underlying library needs random access.
+     * If the file has no MOOV box, only the content type is set and no
+     * XHTML events are emitted.
+     */
+    @Override
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        // The MP4Parser library accepts either a File, or a byte array
+        // As MP4 video files are typically large, always use a file to
+        // avoid OOMs that may occur with in-memory buffering
+        TemporaryResources tmp = new TemporaryResources();
+        TikaInputStream tstream = TikaInputStream.get(stream, tmp);
+        try {
+            IsoFile isoFile = new IsoFile(new DirectFileReadDataSource(tstream.getFile()));
+            tmp.addResource(isoFile);
+
+            // Grab the file type box
+            FileTypeBox fileType = getOrNull(isoFile, FileTypeBox.class);
+            if (fileType != null) {
+                // Identify the type
+                MediaType type = MediaType.application("mp4");
+                for (MediaType t : typesMap.keySet()) {
+                    if (typesMap.get(t).contains(fileType.getMajorBrand())) {
+                        type = t;
+                        break;
+                    }
+                }
+                metadata.set(Metadata.CONTENT_TYPE, type.toString());
+
+                if (type.getType().equals("audio")) {
+                    metadata.set(XMPDM.AUDIO_COMPRESSOR, fileType.getMajorBrand().trim());
+                }
+            } else {
+                // Some older QuickTime files lack the FileType
+                metadata.set(Metadata.CONTENT_TYPE, "video/quicktime");
+            }
+
+
+            // Get the main MOOV box
+            MovieBox moov = getOrNull(isoFile, MovieBox.class);
+            if (moov == null) {
+                // Bail out
+                return;
+            }
+
+
+            XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+            xhtml.startDocument();
+
+
+            // Pull out some information from the header box
+            MovieHeaderBox mHeader = getOrNull(moov, MovieHeaderBox.class);
+            if (mHeader != null) {
+                // Get the creation and modification dates
+                metadata.set(Metadata.CREATION_DATE, mHeader.getCreationTime());
+                metadata.set(TikaCoreProperties.MODIFIED, mHeader.getModificationTime());
+
+                // Get the duration; a zero timescale (corrupt header) would
+                // give an infinite or NaN value, so skip it in that case
+                if (mHeader.getTimescale() > 0) {
+                    double durationSeconds = ((double)mHeader.getDuration()) / mHeader.getTimescale();
+                    String duration;
+                    synchronized (DURATION_FORMAT) {
+                        duration = DURATION_FORMAT.format(durationSeconds);
+                    }
+                    metadata.set(XMPDM.DURATION, duration);
+                }
+
+                // The timescale is normally the sampling rate
+                metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int)mHeader.getTimescale());
+            }
+
+
+            // Get some more information from the track header
+            // TODO Decide how to handle multiple tracks
+            List<TrackBox> tb = moov.getBoxes(TrackBox.class);
+            if (!tb.isEmpty()) {
+                TrackBox track = tb.get(0);
+
+                // A malformed track may lack its header box
+                TrackHeaderBox header = track.getTrackHeaderBox();
+                if (header != null) {
+                    // Get the creation and modification dates
+                    metadata.set(TikaCoreProperties.CREATED, header.getCreationTime());
+                    metadata.set(TikaCoreProperties.MODIFIED, header.getModificationTime());
+
+                    // Get the video width and height
+                    metadata.set(Metadata.IMAGE_WIDTH, (int)header.getWidth());
+                    metadata.set(Metadata.IMAGE_LENGTH, (int)header.getHeight());
+                }
+
+                // Get the sample information, if the track has any
+                SampleTableBox samples = track.getSampleTableBox();
+                SampleDescriptionBox sampleDesc =
+                        (samples == null) ? null : samples.getSampleDescriptionBox();
+                if (sampleDesc != null) {
+                    // Look for the first Audio Sample, if present
+                    AudioSampleEntry sample = getOrNull(sampleDesc, AudioSampleEntry.class);
+                    if (sample != null) {
+                        XMPDM.ChannelTypePropertyConverter.convertAndSet(metadata, sample.getChannelCount());
+                        //metadata.set(XMPDM.AUDIO_SAMPLE_TYPE, sample.getSampleSize()); // TODO Num -> Type mapping
+                        metadata.set(XMPDM.AUDIO_SAMPLE_RATE, (int)sample.getSampleRate());
+                        //metadata.set(XMPDM.AUDIO_, sample.getSamplesPerPacket());
+                        //metadata.set(XMPDM.AUDIO_, sample.getBytesPerSample());
+                    }
+                }
+            }
+
+            // Get metadata from the User Data Box
+            UserDataBox userData = getOrNull(moov, UserDataBox.class);
+            if (userData != null) {
+                MetaBox meta = getOrNull(userData, MetaBox.class);
+
+                // Check for iTunes Metadata
+                // See http://atomicparsley.sourceforge.net/mpeg-4files.html and
+                // http://code.google.com/p/mp4v2/wiki/iTunesMetadata for more on these
+                AppleItemListBox apple = getOrNull(meta, AppleItemListBox.class);
+                if (apple != null) {
+                    // Title
+                    AppleNameBox title = getOrNull(apple, AppleNameBox.class);
+                    addMetadata(TikaCoreProperties.TITLE, metadata, title);
+
+                    // Artist
+                    AppleArtistBox artist = getOrNull(apple, AppleArtistBox.class);
+                    addMetadata(TikaCoreProperties.CREATOR, metadata, artist);
+                    addMetadata(XMPDM.ARTIST, metadata, artist);
+
+                    // Album Artist
+                    AppleArtist2Box artist2 = getOrNull(apple, AppleArtist2Box.class);
+                    addMetadata(XMPDM.ALBUM_ARTIST, metadata, artist2);
+
+                    // Album
+                    AppleAlbumBox album = getOrNull(apple, AppleAlbumBox.class);
+                    addMetadata(XMPDM.ALBUM, metadata, album);
+
+                    // Composer
+                    AppleTrackAuthorBox composer = getOrNull(apple, AppleTrackAuthorBox.class);
+                    addMetadata(XMPDM.COMPOSER, metadata, composer);
+
+                    // Genre
+                    AppleGenreBox genre = getOrNull(apple, AppleGenreBox.class);
+                    addMetadata(XMPDM.GENRE, metadata, genre);
+
+                    // Year
+                    AppleRecordingYear2Box year = getOrNull(apple, AppleRecordingYear2Box.class);
+                    if (year != null) {
+                        metadata.set(XMPDM.RELEASE_DATE, year.getValue());
+                    }
+
+                    // Track number
+                    AppleTrackNumberBox trackNum = getOrNull(apple, AppleTrackNumberBox.class);
+                    if (trackNum != null) {
+                        metadata.set(XMPDM.TRACK_NUMBER, trackNum.getA());
+                        //metadata.set(XMPDM.NUMBER_OF_TRACKS, trackNum.getB()); // TODO
+                    }
+
+                    // Disc number
+                    AppleDiskNumberBox discNum = getOrNull(apple, AppleDiskNumberBox.class);
+                    if (discNum != null) {
+                        metadata.set(XMPDM.DISC_NUMBER, discNum.getA());
+                    }
+
+                    // Compilation
+                    AppleCompilationBox compilation = getOrNull(apple, AppleCompilationBox.class);
+                    if (compilation != null) {
+                        metadata.set(XMPDM.COMPILATION, (int)compilation.getValue());
+                    }
+
+                    // Comment
+                    AppleCommentBox comment = getOrNull(apple, AppleCommentBox.class);
+                    addMetadata(XMPDM.LOG_COMMENT, metadata, comment);
+
+                    // Encoder
+                    AppleEncoderBox encoder = getOrNull(apple, AppleEncoderBox.class);
+                    if (encoder != null) {
+                        metadata.set(XMP.CREATOR_TOOL, encoder.getValue());
+                    }
+
+
+                    // As text
+                    for (Box box : apple.getBoxes()) {
+                        if (box instanceof Utf8AppleDataBox) {
+                            xhtml.element("p", ((Utf8AppleDataBox)box).getValue());
+                        }
+                    }
+                }
+
+                // TODO Check for other kinds too
+            }
+
+            // All done
+            xhtml.endDocument();
+
+        } finally {
+            tmp.dispose();
+        }
+
+    }
+
+    /** Adds the value of the box under the given key, if the box is present */
+    private static void addMetadata(String key, Metadata m, Utf8AppleDataBox metadata) {
+        if (metadata != null) {
+            m.add(key, metadata.getValue());
+        }
+    }
+    /** Sets the given property to the value of the box, if the box is present */
+    private static void addMetadata(Property prop, Metadata m, Utf8AppleDataBox metadata) {
+        if (metadata != null) {
+            m.set(prop, metadata.getValue());
+        }
+    }
+
+    /**
+     * Returns the first direct child box of the given type, or null if
+     * the container is null or holds no such box.
+     */
+    private static <T extends Box> T getOrNull(Container box, Class<T> clazz) {
+        if (box == null) return null;
+
+        List<T> boxes = box.getBoxes(clazz);
+        if (boxes.isEmpty()) {
+            return null;
+        }
+        return boxes.get(0);
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/video/FLVParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/video/FLVParser.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/video/FLVParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/video/FLVParser.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,268 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.video;
+
+import java.io.ByteArrayInputStream;
+import java.io.DataInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * <p>
+ * Parser for metadata contained in Flash Videos (.flv). Resources:
+ * http://osflash.org/flv and for AMF:
+ * http://download.macromedia.com/pub/labs/amf/amf0_spec_121207.pdf
+ * <p>
+ * This parser is capable of extracting the general metadata from header as well
+ * as embedded metadata.
+ * <p>
+ * Known keys for metadata (from file header):
+ * <ol>
+ * <li>hasVideo: true|false
+ * <li>hasAudio: true|false
+ * </ol>
+ * <p>
+ * In addition to the above values also metadata that is inserted in to the
+ * actual stream will be picked. Usually there are keys like:
+ * hasKeyframes, lastkeyframetimestamp, audiocodecid, keyframes, filepositions,
+ * hasMetadata, audiosamplerate, videodatarate metadatadate, videocodecid,
+ * metadatacreator, audiosize, hasVideo, height, audiosamplesize, framerate,
+ * hasCuePoints width, cuePoints, lasttimestamp, canSeekToEnd, datasize,
+ * duration, videosize, filesize, audiodatarate, hasAudio, stereo audiodelay
+ */
+public class FLVParser extends AbstractParser {
+
+    /** Serial version UID */
+    private static final long serialVersionUID = -8718013155719197679L;
+
+    /** FLV tag type carrying script data (the metadata) */
+    private static final int TYPE_METADATA = 0x12;
+    /** Header flag bit signalling the presence of audio tags */
+    private static final byte MASK_AUDIO = 1;
+    /** Header flag bit signalling the presence of video tags */
+    private static final byte MASK_VIDEO = 4;
+    /** AMF type marker that terminates the property list of an object */
+    private static final int AMF_OBJECT_END = 9;
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MediaType.video("x-flv"));
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /** Reads a big-endian unsigned 32 bit integer */
+    private long readUInt32(DataInputStream input) throws IOException {
+        return input.readInt() & 0xFFFFFFFFL;
+    }
+
+    /**
+     * Reads a big-endian unsigned 24 bit integer. An EOFException is
+     * thrown if the stream ends early, rather than folding the -1 end
+     * marker of read() into a bogus (negative) value.
+     */
+    private int readUInt24(DataInputStream input) throws IOException {
+        int uint = input.readUnsignedByte() << 16;
+        uint += input.readUnsignedByte() << 8;
+        uint += input.readUnsignedByte();
+        return uint;
+    }
+
+    /**
+     * Reads one AMF value.
+     *
+     * @param type the AMF type marker, or -1 to read the marker from the stream
+     * @return the decoded value, or null for types that are not handled
+     */
+    private Object readAMFData(DataInputStream input, int type)
+            throws IOException {
+        if (type == -1) {
+            type = input.readUnsignedByte();
+        }
+        switch (type) {
+        case 0:
+            return input.readDouble();
+        case 1:
+            return input.readUnsignedByte() == 1;
+        case 2:
+            return readAMFString(input);
+        case 3:
+            return readAMFObject(input);
+        case 8:
+            return readAMFEcmaArray(input);
+        case 10:
+            return readAMFStrictArray(input);
+        case 11:
+            final Date date = new Date((long) input.readDouble());
+            input.readShort(); // time zone
+            return date;
+        case 13:
+            return "UNDEFINED";
+        default:
+            return null;
+        }
+    }
+
+    /** Reads a strict array: a 32 bit element count followed by the elements */
+    private Object readAMFStrictArray(DataInputStream input) throws IOException {
+        long count = readUInt32(input);
+        ArrayList<Object> list = new ArrayList<Object>();
+        // long counter: the count is an unsigned 32 bit value
+        for (long i = 0; i < count; i++) {
+            list.add(readAMFData(input, -1));
+        }
+        return list;
+    }
+
+
+    /** Reads a string: a 16 bit byte length followed by UTF-8 data */
+    private String readAMFString(DataInputStream input) throws IOException {
+        int size = input.readUnsignedShort();
+        byte[] chars = new byte[size];
+        input.readFully(chars);
+        return new String(chars, UTF_8);
+    }
+
+    /** Reads key / value pairs until the object end marker is found */
+    private Object readAMFObject(DataInputStream input) throws IOException {
+        Map<String, Object> array = new HashMap<String, Object>();
+        while (true) {
+            String key = readAMFString(input);
+            int dataType = input.read();
+            if (dataType == AMF_OBJECT_END) { // object end marker
+                break;
+            }
+            array.put(key, readAMFData(input, dataType));
+        }
+        return array;
+    }
+
+    /**
+     * Reads an ECMA (associative) array: a 32 bit entry count followed by
+     * key / value pairs. Reading stops early if the object end marker is
+     * found before the declared number of entries has been read, so that
+     * an overstated count does not cause a read past the end of the data.
+     */
+    private Object readAMFEcmaArray(DataInputStream input) throws IOException {
+        long size = readUInt32(input);
+        Map<String, Object> array = new HashMap<String, Object>();
+        // long counter: the size is an unsigned 32 bit value
+        for (long i = 0; i < size; i++) {
+            String key = readAMFString(input);
+            int dataType = input.read();
+            if (dataType == AMF_OBJECT_END) {
+                break;
+            }
+            array.put(key, readAMFData(input, dataType));
+        }
+        return array;
+    }
+
+    private boolean checkSignature(DataInputStream fis) throws IOException {
+        return fis.read() == 'F' && fis.read() == 'L' && fis.read() == 'V';
+    }
+
+    /**
+     * Skips exactly count bytes, throwing an EOFException if the stream
+     * ends before that many bytes have been skipped.
+     */
+    private void skipFully(DataInputStream input, int count) throws IOException {
+        int remaining = count;
+        while (remaining > 0) {
+            int skipped = input.skipBytes(remaining);
+            if (skipped > 0) {
+                remaining -= skipped;
+            } else {
+                // Nothing skipped: consume one byte, which fails at end of stream
+                input.readByte();
+                remaining--;
+            }
+        }
+    }
+
+    /**
+     * Validates the FLV header, records the header flags, and then walks
+     * the tag stream copying the entries of every metadata tag into the
+     * given Metadata object. No text content is produced.
+     */
+    @Override
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        DataInputStream datainput = new DataInputStream(stream);
+        if (!checkSignature(datainput)) {
+            throw new TikaException("FLV signature not detected");
+        }
+
+        // header
+        int version = datainput.readUnsignedByte();
+        if (version != 1) {
+            // should be 1, perhaps this is not flv?
+            throw new TikaException("Unpexpected FLV version: " + version);
+        }
+
+        int typeFlags = datainput.readUnsignedByte();
+
+        long len = readUInt32(datainput);
+        if (len != 9) {
+            // we only know about format with header of 9 bytes
+            throw new TikaException("Unpexpected FLV header length: " + len);
+        }
+
+        long sizePrev = readUInt32(datainput);
+        if (sizePrev != 0) {
+            // should be 0, perhaps this is not flv?
+            throw new TikaException(
+                    "Unpexpected FLV first previous block size: " + sizePrev);
+        }
+
+        metadata.set(Metadata.CONTENT_TYPE, "video/x-flv");
+        metadata.set("hasVideo", Boolean.toString((typeFlags & MASK_VIDEO) != 0));
+        metadata.set("hasAudio", Boolean.toString((typeFlags & MASK_AUDIO) != 0));
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        // flv tag stream follows...
+        while (true) {
+            int type = datainput.read();
+            if (type == -1) {
+                // EOF
+                break;
+            }
+
+            int datalen = readUInt24(datainput); //body length
+            readUInt32(datainput); // timestamp
+            readUInt24(datainput); // streamid
+
+            if (type == TYPE_METADATA) {
+                // found metadata Tag, read content to buffer
+                byte[] metaBytes = new byte[datalen];
+                for (int readCount = 0; readCount < datalen;) {
+                    int r = stream.read(metaBytes, readCount, datalen - readCount);
+                    if (r != -1) {
+                        readCount += r;
+                    } else {
+                        break;
+                    }
+                }
+
+                DataInputStream dis = new DataInputStream(
+                        new ByteArrayInputStream(metaBytes));
+
+                // The value of interest is the second AMF value in the tag
+                Object data = null;
+                for (int i = 0; i < 2; i++) {
+                    data = readAMFData(dis, -1);
+                }
+
+                if (data instanceof Map) {
+                    // TODO if there are multiple metadata values with same key (in
+                    // separate AMF blocks, we currently loose previous values)
+                    Map<?, ?> extractedMetadata = (Map<?, ?>) data;
+                    for (Entry<?, ?> entry : extractedMetadata.entrySet()) {
+                        if (entry.getValue() == null) {
+                            continue;
+                        }
+                        metadata.set(String.valueOf(entry.getKey()), entry.getValue().toString());
+                    }
+                }
+
+            } else {
+                // Tag was not metadata, skip over data we cannot handle
+                skipFully(datainput, datalen);
+            }
+
+            sizePrev = readUInt32(datainput); // previous block size
+            if (sizePrev != datalen + 11) {
+                // file was corrupt or we could not parse it...
+                break;
+            }
+        }
+
+        xhtml.endDocument();
+    }
+
+}
Modified: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1722029&r1=1722028&r2=1722029&view=diff
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (original)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Mon Dec 28 23:22:46 2015
@@ -14,5 +14,14 @@
# limitations under the License.
+org.apache.tika.parser.image.BPGParser
org.apache.tika.parser.image.ImageParser
+org.apache.tika.parser.image.PSDParser
+org.apache.tika.parser.image.TiffParser
+org.apache.tika.parser.image.WebPParser
org.apache.tika.parser.jpeg.JpegParser
+org.apache.tika.parser.audio.AudioParser
+org.apache.tika.parser.audio.MidiParser
+org.apache.tika.parser.mp3.Mp3Parser
+org.apache.tika.parser.mp4.MP4Parser
+org.apache.tika.parser.video.FLVParser
Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/audio/AudioParserTest.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.audio;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.tika.Tika;
+import org.apache.tika.metadata.Metadata;
+import org.junit.Test;
+
+public class AudioParserTest {
+
+    /**
+     * Parses the given test resource through the Tika facade, then checks
+     * the detected content type, the audio properties that all the sample
+     * files share, and that no text content was extracted.
+     */
+    private static void assertSampleAudio(String path, String expectedType) throws Exception {
+        Metadata extracted = new Metadata();
+        Tika tika = new Tika();
+        String text = tika.parseToString(
+                AudioParserTest.class.getResourceAsStream(path), extracted);
+
+        assertEquals(expectedType, extracted.get(Metadata.CONTENT_TYPE));
+        assertEquals("44100.0", extracted.get("samplerate"));
+        assertEquals("2", extracted.get("channels"));
+        assertEquals("16", extracted.get("bits"));
+        assertEquals("PCM_SIGNED", extracted.get("encoding"));
+
+        assertEquals("", text);
+    }
+
+    @Test
+    public void testWAV() throws Exception {
+        assertSampleAudio("/test-documents/testWAV.wav", "audio/x-wav");
+    }
+
+    @Test
+    public void testAIFF() throws Exception {
+        assertSampleAudio("/test-documents/testAIFF.aif", "audio/x-aiff");
+    }
+
+    @Test
+    public void testAU() throws Exception {
+        assertSampleAudio("/test-documents/testAU.au", "audio/basic");
+    }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/audio/MidiParserTest.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.audio;
+
+import static org.junit.Assert.assertEquals;
+import static org.apache.tika.TikaTest.assertContains;
+
+import org.apache.tika.Tika;
+import org.apache.tika.metadata.Metadata;
+import org.junit.Test;
+
+public class MidiParserTest {
+
+    /** Location of the sample MIDI file on the test classpath */
+    private static final String MIDI_RESOURCE = "/test-documents/testMID.mid";
+
+    /**
+     * Parses the sample MIDI file through the Tika facade and checks the
+     * detected type, the sequence level metadata and the extracted text.
+     */
+    @Test
+    public void testMID() throws Exception {
+        final Metadata extracted = new Metadata();
+        final Tika tika = new Tika();
+        final String text = tika.parseToString(
+                MidiParserTest.class.getResourceAsStream(MIDI_RESOURCE), extracted);
+
+        assertEquals("audio/midi", extracted.get(Metadata.CONTENT_TYPE));
+        assertEquals("2", extracted.get("tracks"));
+        assertEquals("0", extracted.get("patches"));
+        assertEquals("PPQ", extracted.get("divisionType"));
+
+        assertContains("Untitled", text);
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/image/BPGParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/image/BPGParserTest.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/image/BPGParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/image/BPGParserTest.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Photoshop;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.junit.Test;
+import org.xml.sax.helpers.DefaultHandler;
+
+public class BPGParserTest {
+    private final Parser parser = new BPGParser();
+
+    /**
+     * Runs the BPG parser over the given classpath resource and returns
+     * the metadata it produced. The parser does not close the stream it
+     * is handed, so it is closed here once parsing has finished.
+     */
+    private Metadata parseResource(String resource) throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "image/x-bpg");
+        try (InputStream stream = getClass().getResourceAsStream(resource)) {
+            parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+        }
+        return metadata;
+    }
+
+    /**
+     * Tests a very basic file, without much metadata
+     */
+    @Test
+    public void testBPG() throws Exception {
+        Metadata metadata = parseResource("/test-documents/testBPG.bpg");
+
+        assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+        assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
+        assertEquals("10", metadata.get(Metadata.BITS_PER_SAMPLE));
+        assertEquals("YCbCr Colour", metadata.get(Photoshop.COLOR_MODE));
+    }
+
+    /**
+     * Tests a file with comments
+     */
+    @Test
+    public void testBPG_Commented() throws Exception {
+        Metadata metadata = parseResource("/test-documents/testBPG_commented.bpg");
+
+        assertEquals("103", metadata.get(Metadata.IMAGE_WIDTH));
+        assertEquals("77", metadata.get(Metadata.IMAGE_LENGTH));
+        assertEquals("10", metadata.get(Metadata.BITS_PER_SAMPLE));
+        assertEquals("YCbCr Colour", metadata.get(Photoshop.COLOR_MODE));
+
+        // TODO Get the exif comment data to be properly extracted, see TIKA-1495
+        if (false) {
+            assertEquals("Tosteberga \u00C4ngar", metadata.get(TikaCoreProperties.TITLE));
+            assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(TikaCoreProperties.DESCRIPTION));
+            List<String> keywords = Arrays.asList(metadata.getValues(Metadata.KEYWORDS));
+            assertTrue(keywords.contains("coast"));
+            assertTrue(keywords.contains("bird watching"));
+            assertEquals(keywords, Arrays.asList(metadata.getValues(TikaCoreProperties.KEYWORDS)));
+        }
+
+        // TODO Get the exif data to be properly extracted, see TIKA-1495
+        if (false) {
+            assertEquals("1.0E-6", metadata.get(Metadata.EXPOSURE_TIME)); // 1/1000000
+            assertEquals("2.8", metadata.get(Metadata.F_NUMBER));
+            assertEquals("4.6", metadata.get(Metadata.FOCAL_LENGTH));
+            assertEquals("114", metadata.get(Metadata.ISO_SPEED_RATINGS));
+            assertEquals(null, metadata.get(Metadata.EQUIPMENT_MAKE));
+            assertEquals(null, metadata.get(Metadata.EQUIPMENT_MODEL));
+            assertEquals(null, metadata.get(Metadata.SOFTWARE));
+            assertEquals("1", metadata.get(Metadata.ORIENTATION));
+            assertEquals("300.0", metadata.get(Metadata.RESOLUTION_HORIZONTAL));
+            assertEquals("300.0", metadata.get(Metadata.RESOLUTION_VERTICAL));
+            assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
+        }
+    }
+
+    /**
+     * Tests a file with geographic information in it
+     */
+    @Test
+    public void testBPG_Geo() throws Exception {
+        Metadata metadata = parseResource("/test-documents/testBPG_GEO.bpg");
+
+        assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+        assertEquals("68", metadata.get(Metadata.IMAGE_LENGTH));
+        assertEquals("10", metadata.get(Metadata.BITS_PER_SAMPLE));
+        assertEquals("YCbCr Colour", metadata.get(Photoshop.COLOR_MODE));
+
+        // TODO Get the geographic data to be properly extracted, see TIKA-1495
+        if (false) {
+            assertEquals("12.54321", metadata.get(Metadata.LATITUDE));
+            assertEquals("-54.1234", metadata.get(Metadata.LONGITUDE));
+        }
+
+        // TODO Get the exif data to be properly extracted, see TIKA-1495
+        if (false) {
+            assertEquals("6.25E-4", metadata.get(Metadata.EXPOSURE_TIME)); // 1/1600
+            assertEquals("5.6", metadata.get(Metadata.F_NUMBER));
+            assertEquals("false", metadata.get(Metadata.FLASH_FIRED));
+            assertEquals("194.0", metadata.get(Metadata.FOCAL_LENGTH));
+            assertEquals("400", metadata.get(Metadata.ISO_SPEED_RATINGS));
+            assertEquals("Canon", metadata.get(Metadata.EQUIPMENT_MAKE));
+            assertEquals("Canon EOS 40D", metadata.get(Metadata.EQUIPMENT_MODEL));
+            assertEquals("Adobe Photoshop CS3 Macintosh", metadata.get(Metadata.SOFTWARE));
+            assertEquals("240.0", metadata.get(Metadata.RESOLUTION_HORIZONTAL));
+            assertEquals("240.0", metadata.get(Metadata.RESOLUTION_VERTICAL));
+            assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
+        }
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
+
+import java.util.Arrays;
+import java.util.GregorianCalendar;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Locale;
+import java.util.TimeZone;
+
+import com.drew.metadata.Directory;
+import com.drew.metadata.MetadataException;
+import com.drew.metadata.Tag;
+import com.drew.metadata.exif.ExifIFD0Directory;
+import com.drew.metadata.exif.ExifSubIFDDirectory;
+import com.drew.metadata.jpeg.JpegCommentDirectory;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.junit.Test;
+
+/**
+ * Unit tests for {@link ImageMetadataExtractor} and its nested handler types
+ * ({@code DirectoryHandler}, {@code ExifHandler} and
+ * {@code CopyUnknownFieldsHandler}).
+ *
+ * <p>The metadata-extractor directories and tags are Mockito mocks, so no
+ * real image files are read by these tests.</p>
+ */
+public class ImageMetadataExtractorTest {
+
+    /**
+     * Checks that {@code handle(Iterator)} asks the configured handler whether
+     * it supports the class of the directory it is given and, when the handler
+     * says yes, passes that directory and the metadata object on to it.
+     *
+     * @throws MetadataException declared by the extractor and handler calls
+     */
+    @SuppressWarnings({"rawtypes", "unchecked"})
+    @Test
+    public void testHandleDirectories() throws MetadataException {
+        Metadata metadata = mock(Metadata.class);
+        ImageMetadataExtractor.DirectoryHandler handler1 =
+                mock(ImageMetadataExtractor.DirectoryHandler.class);
+        ImageMetadataExtractor e = new ImageMetadataExtractor(metadata, handler1);
+
+        Directory directory = new JpegCommentDirectory();
+        // The mocked iterator yields exactly one directory: hasNext() answers
+        // true on the first call and false afterwards.
+        Iterator directories = mock(Iterator.class);
+        when(directories.hasNext()).thenReturn(true, false);
+        when(directories.next()).thenReturn(directory);
+        when(handler1.supports(JpegCommentDirectory.class)).thenReturn(true);
+
+        e.handle(directories);
+        verify(handler1).supports(JpegCommentDirectory.class);
+        verify(handler1).handle(directory, metadata);
+    }
+
+    /**
+     * Checks that {@code ExifHandler.supports} accepts the two EXIF directory
+     * classes and rejects the base {@link Directory} class as well as
+     * {@link JpegCommentDirectory}.
+     */
+    @Test
+    public void testExifHandlerSupports() {
+        assertTrue(new ImageMetadataExtractor.ExifHandler().supports(ExifIFD0Directory.class));
+        assertTrue(new ImageMetadataExtractor.ExifHandler().supports(ExifSubIFDDirectory.class));
+        assertFalse(new ImageMetadataExtractor.ExifHandler().supports(Directory.class));
+        assertFalse(new ImageMetadataExtractor.ExifHandler().supports(JpegCommentDirectory.class));
+    }
+
+    /**
+     * Checks that a date reported under
+     * {@code ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL} ends up in
+     * {@link TikaCoreProperties#CREATED} formatted as an ISO date-time without
+     * a time zone.
+     *
+     * @throws MetadataException declared by the handler call
+     */
+    @Test
+    public void testExifHandlerParseDate() throws MetadataException {
+        ExifSubIFDDirectory exif = mock(ExifSubIFDDirectory.class);
+        when(exif.containsTag(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)).thenReturn(true);
+        GregorianCalendar calendar = new GregorianCalendar(TimeZone.getDefault(), Locale.ROOT);
+        // Reset first so the millisecond field is zero; the six-argument set()
+        // below leaves fields it does not name untouched.
+        calendar.setTimeInMillis(0);
+        calendar.set(2000, 0, 1, 0, 0, 0);
+        when(exif.getDate(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)).thenReturn(
+                calendar.getTime()); // jvm default timezone as in Metadata Extractor
+        Metadata metadata = new Metadata();
+
+        new ImageMetadataExtractor.ExifHandler().handle(exif, metadata);
+        assertEquals("Should be ISO date without time zone", "2000-01-01T00:00:00",
+                metadata.get(TikaCoreProperties.CREATED));
+    }
+
+    /**
+     * Checks that a date reported only under
+     * {@code ExifIFD0Directory.TAG_DATETIME} is still written to
+     * {@link TikaCoreProperties#CREATED}.
+     *
+     * @throws MetadataException declared by the handler call
+     */
+    @Test
+    public void testExifHandlerParseDateFallback() throws MetadataException {
+        ExifIFD0Directory exif = mock(ExifIFD0Directory.class);
+        when(exif.containsTag(ExifIFD0Directory.TAG_DATETIME)).thenReturn(true);
+        GregorianCalendar calendar = new GregorianCalendar(TimeZone.getDefault(), Locale.ROOT);
+        // Reset first so the millisecond field is zero; the six-argument set()
+        // below leaves fields it does not name untouched.
+        calendar.setTimeInMillis(0);
+        calendar.set(1999, 0, 1, 0, 0, 0);
+        when(exif.getDate(ExifIFD0Directory.TAG_DATETIME)).thenReturn(
+                calendar.getTime()); // jvm default timezone as in Metadata Extractor
+        Metadata metadata = new Metadata();
+
+        new ImageMetadataExtractor.ExifHandler().handle(exif, metadata);
+        assertEquals("Should try EXIF Date/Time if Original is not set", "1999-01-01T00:00:00",
+                metadata.get(TikaCoreProperties.CREATED));
+    }
+
+    /**
+     * Checks that handling completes and leaves
+     * {@link TikaCoreProperties#CREATED} unset when the directory claims to
+     * contain the original date tag but {@code getDate} returns {@code null}.
+     *
+     * @throws MetadataException declared by the handler call
+     */
+    @Test
+    public void testExifHandlerParseDateError() throws MetadataException {
+        // NOTE(review): the mock is an ExifIFD0Directory while the tag constant
+        // comes from ExifSubIFDDirectory -- confirm this mix is intentional.
+        ExifIFD0Directory exif = mock(ExifIFD0Directory.class);
+        when(exif.containsTag(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)).thenReturn(true);
+        when(exif.getDate(ExifSubIFDDirectory.TAG_DATETIME_ORIGINAL)).thenReturn(null);
+        Metadata metadata = new Metadata();
+
+        new ImageMetadataExtractor.ExifHandler().handle(exif, metadata);
+        assertNull("Parsing should proceed without date",
+                metadata.get(TikaCoreProperties.CREATED));
+    }
+
+    /**
+     * Checks that {@code CopyUnknownFieldsHandler} copies a tag named
+     * "Image Description" into the metadata under that same name, while tags
+     * named after {@link Metadata#KEYWORDS} or
+     * {@link TikaCoreProperties#DESCRIPTION} are not copied.
+     *
+     * @throws MetadataException declared by the handler call
+     */
+    @Test
+    public void testCopyUnknownFieldsHandler() throws MetadataException {
+        Directory d = mock(Directory.class);
+        Tag t1 = mock(Tag.class);
+        when(t1.getTagName()).thenReturn("Image Description");
+        when(t1.getDescription()).thenReturn("t1");
+        Tag t2 = mock(Tag.class);
+        when(t2.getTagName()).thenReturn(Metadata.KEYWORDS);
+        when(t2.getDescription()).thenReturn("known");
+        Tag t3 = mock(Tag.class);
+        when(t3.getTagName()).thenReturn(TikaCoreProperties.DESCRIPTION.getName());
+        when(t3.getDescription()).thenReturn("known");
+        List<Tag> tags = Arrays.asList(t1, t2, t3);
+        when(d.getTags()).thenReturn(tags);
+        Metadata metadata = new Metadata();
+        new ImageMetadataExtractor.CopyUnknownFieldsHandler().handle(d, metadata);
+        assertEquals("t1", metadata.get("Image Description"));
+        assertNull("keywords should be excluded from bulk copy because it is a defined field",
+                metadata.get(Metadata.KEYWORDS));
+        assertNull("description should be excluded from bulk copy because it is a defined field",
+                metadata.get(TikaCoreProperties.DESCRIPTION));
+    }
+
+}