You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/02/08 22:16:12 UTC

svn commit: r742169 - in /lucene/tika/trunk: CHANGES.txt src/main/java/org/apache/tika/parser/audio/MidiParser.java

Author: jukka
Date: Sun Feb  8 21:16:12 2009
New Revision: 742169

URL: http://svn.apache.org/viewvc?rev=742169&view=rev
Log:
TIKA-201: Extract lyrics and other text from MIDI audio files

MIDI meta text events are now extracted and written out in one <p> element per track.

Modified:
    lucene/tika/trunk/CHANGES.txt
    lucene/tika/trunk/src/main/java/org/apache/tika/parser/audio/MidiParser.java

Modified: lucene/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/CHANGES.txt?rev=742169&r1=742168&r2=742169&view=diff
==============================================================================
--- lucene/tika/trunk/CHANGES.txt (original)
+++ lucene/tika/trunk/CHANGES.txt Sun Feb  8 21:16:12 2009
@@ -22,6 +22,9 @@
     --text argument. This prevents the end of the text output from being
     lost. (TIKA-179)
 
+  * Embedded text in MIDI files is now extracted. For example, many karaoke
+    files contain song lyrics embedded as MIDI text.
+
 See http://tinyurl.com/tika-0-3-changes for a list of all changes in Tika 0.3.
 
 The following people have contributed to Tika 0.3 by submitting or commenting

Modified: lucene/tika/trunk/src/main/java/org/apache/tika/parser/audio/MidiParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/audio/MidiParser.java?rev=742169&r1=742168&r2=742169&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/audio/MidiParser.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/audio/MidiParser.java Sun Feb  8 21:16:12 2009
@@ -21,6 +21,8 @@
 import java.io.InputStream;
 
 import javax.sound.midi.InvalidMidiDataException;
+import javax.sound.midi.MetaMessage;
+import javax.sound.midi.MidiMessage;
 import javax.sound.midi.MidiSystem;
 import javax.sound.midi.Patch;
 import javax.sound.midi.Sequence;
@@ -66,6 +68,23 @@
             } else if (type == Sequence.SMPTE_24) {
                 metadata.set("divisionType", String.valueOf(type));
             }
+
+            for (Track track : tracks) {
+                xhtml.startElement("p");
+                for (int i = 0; i < track.size(); i++) {
+                    MidiMessage message = track.get(i).getMessage();
+                    if (message instanceof MetaMessage) {
+                        MetaMessage meta = (MetaMessage) message;
+                        // Types 1-15 are reserved for text events
+                        if (meta.getType() >= 1 && meta.getType() <= 15) {
+                            // FIXME: What's the encoding?
+                            xhtml.characters(
+                                    new String(meta.getData(), "ISO-8859-1"));
+                        }
+                    }
+                }
+                xhtml.endElement("p");
+            }
         } catch (InvalidMidiDataException ignore) {
             // There is no way to know whether this exception was
             // caused by the document being corrupted or by the format