You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/12/14 23:17:28 UTC

svn commit: r890503 - in /lucene/tika/trunk: tika-core/src/main/resources/org/apache/tika/ tika-parsers/src/main/java/org/apache/tika/parser/video/ tika-parsers/src/test/java/org/apache/tika/parser/video/ tika-parsers/src/test/resources/test-documents/

Author: jukka
Date: Mon Dec 14 22:17:28 2009
New Revision: 890503

URL: http://svn.apache.org/viewvc?rev=890503&view=rev
Log:
TIKA-328: Add parser for .flv videos

Patch by Sami Siren

Added:
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/video/
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/video/FLVParser.java
    lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/video/
    lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/video/FLVParserTest.java
    lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testFLV.flv   (with props)
Modified:
    lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/tika-config.xml

Modified: lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/tika-config.xml
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/tika-config.xml?rev=890503&r1=890502&r2=890503&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/tika-config.xml (original)
+++ lucene/tika/trunk/tika-core/src/main/resources/org/apache/tika/tika-config.xml Mon Dec 14 22:17:28 2009
@@ -172,6 +172,10 @@
                 <mime>application/epub+zip</mime>
         </parser>
 
+        <parser name="parse-flv" class="org.apache.tika.parser.video.FLVParser">
+                <mime>video/x-flv</mime>
+        </parser>
+
     </parsers>
 
 </properties>
\ No newline at end of file

Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/video/FLVParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/video/FLVParser.java?rev=890503&view=auto
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/video/FLVParser.java (added)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/video/FLVParser.java Mon Dec 14 22:17:28 2009
@@ -0,0 +1,254 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.video;
+
+import java.io.ByteArrayInputStream;
+import java.io.DataInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * <p>
+ * Parser for metadata contained in Flash Videos (.flv). Resources:
+ * http://osflash.org/flv and for AMF:
+ * http://download.macromedia.com/pub/labs/amf/amf0_spec_121207.pdf
+ * <p>
+ * This parser is capable of extracting the general metadata from the header
+ * as well as embedded metadata.
+ * <p>
+ * Known keys for metadata (from file header):
+ * <ol>
+ * <li>hasVideo: true|false
+ * <li>hasAudio: true|false
+ * </ol>
+ * <p>
+ * In addition to the above values, metadata that is inserted into the
+ * actual stream will also be picked up. Usually there are keys like:
+ * hasKeyframes, lastkeyframetimestamp, audiocodecid, keyframes, filepositions,
+ * hasMetadata, audiosamplerate, videodatarate, metadatadate, videocodecid,
+ * metadatacreator, audiosize, hasVideo, height, audiosamplesize, framerate,
+ * hasCuePoints, width, cuePoints, lasttimestamp, canSeekToEnd, datasize,
+ * duration, videosize, filesize, audiodatarate, hasAudio, stereo, audiodelay
+ */
+public class FLVParser implements Parser {
+
+    /** Tag type of the tags whose body is decoded as AMF metadata. */
+    private static final int TYPE_METADATA = 0x12;
+
+    /** Bit of the header type flags reported as "hasAudio". */
+    private static final byte MASK_AUDIO = 1;
+
+    /** Bit of the header type flags reported as "hasVideo". */
+    private static final byte MASK_VIDEO = 4;
+
+    /** The only header length (in bytes) this parser knows how to handle. */
+    private static final int HEADER_LENGTH = 9;
+
+    /**
+     * Number of bytes by which the "previous tag size" field exceeds the
+     * body length of the tag it follows.
+     */
+    private static final int TAG_HEADER_LENGTH = 11;
+
+    /**
+     * Reads a big-endian unsigned 32 bit integer.
+     *
+     * @param input stream to read from
+     * @return the value, in the range 0..2^32-1
+     * @throws IOException if the stream ends prematurely or cannot be read
+     */
+    private long readUInt32(DataInputStream input) throws IOException {
+        return input.readInt() & 0xFFFFFFFFL;
+    }
+
+    /**
+     * Reads a big-endian unsigned 24 bit integer.
+     *
+     * @param input stream to read from
+     * @return the value, in the range 0..2^24-1
+     * @throws IOException if the stream ends prematurely or cannot be read
+     */
+    private int readUInt24(DataInputStream input) throws IOException {
+        // readUnsignedByte() fails with an EOFException at the end of the
+        // stream, whereas read() would return -1 and corrupt the result
+        int uint = input.readUnsignedByte() << 16;
+        uint += input.readUnsignedByte() << 8;
+        uint += input.readUnsignedByte();
+        return uint;
+    }
+
+    /**
+     * Reads a single AMF value.
+     *
+     * @param input stream positioned at the value, or at its type marker
+     *              when <code>type</code> is -1
+     * @param type AMF type marker of the value, or -1 if the marker has
+     *             not been read from the stream yet
+     * @return the decoded value, or <code>null</code> for a type that is
+     *         not handled here (nothing beyond the marker is consumed then)
+     * @throws IOException if the stream ends prematurely or cannot be read
+     */
+    private Object readAMFData(DataInputStream input, int type)
+            throws IOException {
+        if (type == -1) {
+            type = input.readUnsignedByte();
+        }
+        switch (type) {
+        case 0: // number
+            return input.readDouble();
+        case 1: // boolean
+            return input.readUnsignedByte() == 1;
+        case 2: // string
+            return readAMFString(input);
+        case 3: // object
+            return readAMFObject(input);
+        case 8: // ECMA array
+            return readAMFEcmaArray(input);
+        case 10: // strict array
+            return readAMFStrictArray(input);
+        case 11: // date
+            final Date date = new Date((long) input.readDouble());
+            // the two time zone bytes are not used; readShort() always
+            // consumes both, which skip() does not guarantee
+            input.readShort();
+            return date;
+        case 13:
+            return "UNDEFINED";
+        default:
+            return null;
+        }
+    }
+
+    /**
+     * Reads an AMF strict array: an unsigned 32 bit element count followed
+     * by that many values, each preceded by its own type marker.
+     *
+     * @param input stream positioned right after the array's type marker
+     * @return the elements as a list, in stream order
+     * @throws IOException if the stream ends prematurely or cannot be read
+     */
+    private Object readAMFStrictArray(DataInputStream input) throws IOException {
+        long count = readUInt32(input);
+        ArrayList<Object> list = new ArrayList<Object>();
+        // long counter: the count is unsigned and may exceed the int range
+        for (long i = 0; i < count; i++) {
+            list.add(readAMFData(input, -1));
+        }
+        return list;
+    }
+
+    /**
+     * Reads an AMF string: an unsigned 16 bit byte count followed by that
+     * many bytes of UTF-8 encoded text.
+     *
+     * @param input stream positioned at the length field of the string
+     * @return the decoded string, never <code>null</code>
+     * @throws IOException if the stream ends prematurely or cannot be read
+     */
+    private String readAMFString(DataInputStream input) throws IOException {
+        int size = input.readUnsignedShort();
+        byte[] chars = new byte[size];
+        input.readFully(chars);
+        // decode explicitly as UTF-8 instead of the platform default charset
+        return new String(chars, "UTF-8");
+    }
+
+    /**
+     * Reads an AMF object: key/value pairs up to the object end marker.
+     *
+     * @param input stream positioned right after the object's type marker
+     * @return the properties as a map from key to decoded value
+     * @throws IOException if the stream ends prematurely or cannot be read
+     */
+    private Object readAMFObject(DataInputStream input) throws IOException {
+        HashMap<String, Object> array = new HashMap<String, Object>();
+        while (true) {
+            String key = readAMFString(input);
+            int dataType = input.readUnsignedByte();
+            if (dataType == 9) { // object end marker
+                break;
+            }
+            array.put(key, readAMFData(input, dataType));
+        }
+        return array;
+    }
+
+    /**
+     * Reads an AMF ECMA array: an unsigned 32 bit entry count followed by
+     * that many key/value pairs.
+     * <p>
+     * NOTE(review): the AMF0 specification describes an object end marker
+     * after the entries; it is not consumed here. Confirm whether nested
+     * arrays need it.
+     *
+     * @param input stream positioned right after the array's type marker
+     * @return the entries as a map from key to decoded value
+     * @throws IOException if the stream ends prematurely or cannot be read
+     */
+    private Object readAMFEcmaArray(DataInputStream input) throws IOException {
+        long size = readUInt32(input);
+        HashMap<String, Object> array = new HashMap<String, Object>();
+        // long counter: the size is unsigned and may exceed the int range
+        for (long i = 0; i < size; i++) {
+            String key = readAMFString(input);
+            int dataType = input.readUnsignedByte();
+            array.put(key, readAMFData(input, dataType));
+        }
+        return array;
+    }
+
+    /**
+     * Consumes the first three bytes of the stream and checks that they
+     * spell "FLV". Reading stops at the first byte that does not match.
+     *
+     * @param fis stream positioned at the start of the file
+     * @return <code>true</code> if the signature was found
+     * @throws IOException if the stream cannot be read
+     */
+    private boolean checkSignature(DataInputStream fis) throws IOException {
+        return fis.read() == 'F' && fis.read() == 'L' && fis.read() == 'V';
+    }
+
+    /**
+     * Reads the part of the file header that follows the signature and
+     * records the "hasVideo" and "hasAudio" flags.
+     *
+     * @param input stream positioned right after the signature
+     * @param metadata receives the "hasVideo" and "hasAudio" values
+     * @return <code>true</code> if the header looks like one this parser
+     *         understands and the tag stream can be read next
+     * @throws IOException if the stream ends prematurely or cannot be read
+     */
+    private boolean readHeader(DataInputStream input, Metadata metadata)
+            throws IOException {
+        int version = input.readUnsignedByte();
+        if (version != 1) {
+            // should be 1, perhaps this is not flv?
+            return false;
+        }
+
+        int typeFlags = input.readUnsignedByte();
+        metadata.add("hasVideo", Boolean.toString((typeFlags & MASK_VIDEO) != 0));
+        metadata.add("hasAudio", Boolean.toString((typeFlags & MASK_AUDIO) != 0));
+
+        if (readUInt32(input) != HEADER_LENGTH) {
+            // we only know about format with header of 9 bytes
+            return false;
+        }
+
+        // size of the (non-existent) previous tag: should be 0, otherwise
+        // perhaps this is not flv?
+        return readUInt32(input) == 0;
+    }
+
+    /**
+     * Skips over the given number of bytes, working around
+     * <code>skip()</code> calls that make no progress.
+     *
+     * @param input stream to skip in
+     * @param count number of bytes to skip
+     * @return <code>true</code> if all bytes were skipped,
+     *         <code>false</code> if the stream ended first
+     * @throws IOException if the stream cannot be read
+     */
+    private boolean skipFully(DataInputStream input, long count)
+            throws IOException {
+        long remaining = count;
+        while (remaining > 0) {
+            long skipped = input.skip(remaining);
+            if (skipped > 0) {
+                remaining -= skipped;
+            } else if (input.read() != -1) {
+                // skip() made no progress although data is left
+                remaining--;
+            } else {
+                return false; // end of stream
+            }
+        }
+        return true;
+    }
+
+    /**
+     * Reads the body of a metadata tag and copies the key/value pairs found
+     * in it to the given metadata object. Entries without a value are
+     * ignored.
+     *
+     * @param input stream positioned at the start of the tag body
+     * @param length length of the tag body in bytes
+     * @param metadata receives the extracted values
+     * @throws IOException if the stream or the tag body ends prematurely
+     */
+    private void readMetadataTag(
+            DataInputStream input, int length, Metadata metadata)
+            throws IOException {
+        // buffer the body so that decoding can never read past the tag
+        byte[] metaBytes = new byte[length];
+        input.readFully(metaBytes);
+        DataInputStream dis =
+            new DataInputStream(new ByteArrayInputStream(metaBytes));
+
+        // two values are decoded and only the second one is used (the
+        // first is presumably the event name, e.g. "onMetaData")
+        Object data = null;
+        for (int i = 0; i < 2; i++) {
+            data = readAMFData(dis, -1);
+        }
+
+        if (data instanceof Map) {
+            // TODO if there are multiple metadata values with same key (in
+            // separate AMF blocks, we currently lose previous values)
+            for (Entry<?, ?> entry : ((Map<?, ?>) data).entrySet()) {
+                Object value = entry.getValue();
+                if (value != null) { // unsupported AMF types decode to null
+                    metadata.set(
+                            String.valueOf(entry.getKey()), value.toString());
+                }
+            }
+        }
+    }
+
+    /**
+     * Walks through the tag stream that follows the file header, extracting
+     * metadata tags and skipping all others. Stops at the end of the stream
+     * or as soon as the stream no longer looks consistent.
+     *
+     * @param input stream positioned at the first tag
+     * @param metadata receives the extracted values
+     * @throws IOException if the stream ends prematurely or cannot be read
+     */
+    private void readTags(DataInputStream input, Metadata metadata)
+            throws IOException {
+        while (true) {
+            int type = input.read();
+            if (type == -1) {
+                // EOF
+                break;
+            }
+
+            int datalen = readUInt24(input); // body length
+            input.readInt();                 // timestamp (4 bytes)
+            readUInt24(input);               // streamid (3 bytes)
+
+            if (type == TYPE_METADATA) {
+                readMetadataTag(input, datalen, metadata);
+            } else if (!skipFully(input, datalen)) {
+                // Tag was not metadata and its data is truncated
+                break;
+            }
+
+            long sizePrev = readUInt32(input); // previous block size
+            if (sizePrev != datalen + TAG_HEADER_LENGTH) {
+                // file was corrupt or we could not parse it...
+                break;
+            }
+        }
+    }
+
+    /**
+     * Extracts metadata from a Flash Video stream. No text content is
+     * produced; the XHTML document sent to the handler is empty.
+     *
+     * @param stream the video stream, positioned at the start of the file
+     * @param handler receives the (empty) XHTML document
+     * @param metadata receives the extracted metadata
+     * @param context parse context, not used by this parser
+     * @throws IOException if the stream ends prematurely or cannot be read
+     * @throws SAXException if the handler fails to process the document
+     * @throws TikaException if the stream does not start with the FLV
+     *         signature
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+
+        DataInputStream datainput = new DataInputStream(stream);
+        if (!checkSignature(datainput)) {
+            throw new TikaException("FLV signature not detected");
+        }
+
+        // flv tag stream follows the header...
+        if (readHeader(datainput, metadata)) {
+            readTags(datainput, metadata);
+        }
+
+        // always reached for a file with a valid signature, so that the
+        // document started above is properly ended
+        xhtml.endDocument();
+    }
+
+    /**
+     * Same as {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)}
+     * with a <code>null</code> parse context.
+     */
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata) throws IOException, SAXException, TikaException {
+        parse(stream, handler, metadata, null);
+    }
+
+}

Added: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/video/FLVParserTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/video/FLVParserTest.java?rev=890503&view=auto
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/video/FLVParserTest.java (added)
+++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/video/FLVParserTest.java Mon Dec 14 22:17:28 2009
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.video;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.Tika;
+import org.apache.tika.metadata.Metadata;
+
+/**
+ * Test case for the FLV parser, run through the {@link Tika} facade.
+ */
+public class FLVParserTest extends TestCase {
+
+    /**
+     * Parses the sample video from the test resources and checks the
+     * reported content type along with a selection of metadata values.
+     */
+    public void testFLV() throws Exception {
+        String path = "/test-documents/testFLV.flv";
+        Metadata metadata = new Metadata();
+
+        // only the metadata is checked, the returned text is not needed
+        new Tika().parseToString(
+                FLVParserTest.class.getResourceAsStream(path), metadata);
+
+        assertEquals("video/x-flv", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("true", metadata.get("hasVideo"));
+        assertEquals("false", metadata.get("stereo"));
+        assertEquals("true", metadata.get("hasAudio"));
+        assertEquals("120.0", metadata.get("height"));
+        assertEquals("16.0", metadata.get("audiosamplesize"));
+    }
+
+}

Added: lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testFLV.flv
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testFLV.flv?rev=890503&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testFLV.flv
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream