You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2010/06/28 19:52:17 UTC

svn commit: r958659 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/mp3/ test/java/org/apache/tika/parser/mp3/ test/resources/test-documents/

Author: nick
Date: Mon Jun 28 17:52:17 2010
New Revision: 958659

URL: http://svn.apache.org/viewvc?rev=958659&view=rev
Log:
MP3 Lyrics text extraction support
Updates the MP3 parser to detect a Lyrics3 v2 block located just before the ID3v1 tag block. If one is found, the lyrics text is captured and output.

Added:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java   (with props)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3lyrics.mp3   (with props)
Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java?rev=958659&r1=958658&r2=958659&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java Mon Jun 28 17:52:17 2010
@@ -43,7 +43,7 @@ public class ID3v1Handler implements ID3
 
     public ID3v1Handler(InputStream stream, ContentHandler handler)
             throws IOException, SAXException, TikaException {
-        this(getSuffix(stream, 128));
+        this(LyricsHandler.getSuffix(stream, 128));
     }
 
     /**
@@ -155,37 +155,4 @@ public class ID3v1Handler implements ID3
             throw new TikaException("ISO-8859-1 encoding is not available", e);
         }
     }
-
-    /**
-     * Reads and returns the last <code>length</code> bytes from the
-     * given stream.
-     * @param stream input stream
-     * @param length number of bytes from the end to read and return
-     * @return stream the <code>InputStream</code> to read from.
-     * @throws IOException if the stream could not be read from.
-     */
-    private static byte[] getSuffix(InputStream stream, int length)
-            throws IOException {
-        byte[] buffer = new byte[2 * length];
-        int bytesInBuffer = 0;
-
-        int n = stream.read(buffer);
-        while (n != -1) {
-            bytesInBuffer += n;
-            if (bytesInBuffer == buffer.length) {
-                System.arraycopy(buffer, bytesInBuffer - length, buffer, 0, length);
-                bytesInBuffer = length;
-            }
-            n = stream.read(buffer, bytesInBuffer, buffer.length - bytesInBuffer);
-        }
-
-        if (bytesInBuffer < length) {
-            length = bytesInBuffer;
-        }
-
-        byte[] result = new byte[length];
-        System.arraycopy(buffer, bytesInBuffer - length, result, 0, length);
-        return result;
-    }
-
 }

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java?rev=958659&r1=958658&r2=958659&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java Mon Jun 28 17:52:17 2010
@@ -86,7 +86,7 @@ public class ID3v2Frame implements MP3Fr
         if (AudioFrame.isAudioHeader(h1, h2, h3, h4)) {
             return new AudioFrame(h1, h2, h3, h4, inp);
         }
-
+        
         // Not a frame header
         return null;
     }

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java?rev=958659&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java Mon Jun 28 17:52:17 2010
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * This is used to parse Lyrics3 tag information
+ *  from an MP3 file, if available.
+ * Handles lyrics tags of up to 10kb in size.
+ * Will process any ID3v1 tag data if present.
+ * Ignores extended ID3v1 data in the lyrics block
+ *
+ * @see <a href="http://www.id3.org/Lyrics3v2">Lyrics3 v2.0 specification</a>
+ */
+public class LyricsHandler {
+    boolean foundLyrics = false;
+    String lyricsText = null;
+    ID3v1Handler id3v1 = null;
+
+    public LyricsHandler(InputStream stream, ContentHandler handler)
+            throws IOException, SAXException, TikaException {
+        this(getSuffix(stream, 10240+128));
+    }
+
+    /**
+     * Looks for the Lyrics data, which will be
+     *  just before the ID3v1 data (if present),
+     *  and process it.
+     * Also sets things up for the ID3v1
+     *  processing if required.
+     * Creates from the trailing bytes of a stream; at least 128 are required.
+     */
+    protected LyricsHandler(byte[] tagData)
+            throws IOException, SAXException, TikaException {
+        if(tagData.length < 128) {
+            return;
+        }
+
+        // Is there ID3v1 data?
+        byte[] last128 = new byte[128];
+        System.arraycopy(tagData, tagData.length-128, last128, 0, 128);
+        id3v1 = new ID3v1Handler(last128);
+
+        if(tagData.length < 137) {
+            return;
+        }
+
+        // Are there lyrics?
+        int lookat = tagData.length - 9;
+        if(id3v1.found) {
+            lookat -= 128;
+        }
+        if(tagData[lookat+0] == 'L' && tagData[lookat+1] == 'Y' && 
+                tagData[lookat+2] == 'R' && tagData[lookat+3] == 'I' &&
+                tagData[lookat+4] == 'C' && tagData[lookat+5] == 'S' &&
+                tagData[lookat+6] == '2' && tagData[lookat+7] == '0' &&
+                tagData[lookat+8] == '0') {
+            foundLyrics = true;
+
+            int length = Integer.parseInt(
+                    new String(tagData, lookat-6, 6)
+            );
+
+            String lyrics = new String(
+                    tagData, lookat-length+5, length-11,
+                    "ASCII"
+            );
+
+            int pos = 0;
+            while(pos < lyrics.length()-9) {
+                String tagName = lyrics.substring(pos, pos+3);
+                int tagLen = Integer.parseInt(
+                        lyrics.substring(pos+3, pos+9)
+                );
+
+                if(tagName.equals("LYR")) {
+                    lyricsText = lyrics.substring(pos+9, pos+9+tagLen);
+                }
+
+                pos += tagLen;
+            }
+        }
+    }
+
+    public boolean hasID3v1() {
+        if(id3v1 == null || id3v1.found == false) {
+            return false;
+        }
+        return true;
+    }
+    public boolean hasLyrics() {
+        return lyricsText != null && lyricsText.length() > 0;
+    }
+
+    /**
+     * Reads and returns the last <code>length</code> bytes from the
+     * given stream.
+     * @param stream input stream
+     * @param length number of bytes from the end to read and return
+     * @return the last <code>length</code> bytes of the stream, or all of its bytes if it is shorter
+     * @throws IOException if the stream could not be read from.
+     */
+    protected static byte[] getSuffix(InputStream stream, int length)
+            throws IOException {
+        byte[] buffer = new byte[2 * length];
+        int bytesInBuffer = 0;
+
+        int n = stream.read(buffer);
+        while (n != -1) {
+            bytesInBuffer += n;
+            if (bytesInBuffer == buffer.length) {
+                System.arraycopy(buffer, bytesInBuffer - length, buffer, 0, length);
+                bytesInBuffer = length;
+            }
+            n = stream.read(buffer, bytesInBuffer, buffer.length - bytesInBuffer);
+        }
+
+        if (bytesInBuffer < length) {
+            length = bytesInBuffer;
+        }
+
+        byte[] result = new byte[length];
+        System.arraycopy(buffer, bytesInBuffer - length, result, 0, length);
+        return result;
+    }
+}

Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java?rev=958659&r1=958658&r2=958659&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java Mon Jun 28 17:52:17 2010
@@ -97,6 +97,9 @@ public class Mp3Parser implements Parser
                     XMPDM.AUDIO_SAMPLE_RATE,
                     Integer.toString(audioAndTags.audio.getSampleRate()));
         }
+        if (audioAndTags.lyrics != null && audioAndTags.lyrics.hasLyrics()) {
+        	xhtml.element("p", audioAndTags.lyrics.lyricsText);
+        }
 
         xhtml.endDocument();
     }
@@ -120,6 +123,7 @@ public class Mp3Parser implements Parser
        ID3v23Handler v23 = null;
        ID3v22Handler v22 = null;
        ID3v1Handler v1 = null;
+       LyricsHandler lyrics = null;
        AudioFrame firstAudio = null;
 
        // ID3v2 tags live at the start of the file
@@ -142,8 +146,10 @@ public class Mp3Parser implements Parser
        }
 
        // ID3v1 tags live at the end of the file
-       // Our handler handily seeks to the end for us
-       v1 = new ID3v1Handler(stream, handler);
+       // Lyrics live just before ID3v1, at the end of the file
+       // Search for both (handlers seek to the end for us)
+       lyrics = new LyricsHandler(stream, handler);
+       v1 = lyrics.id3v1;
 
        // Go in order of preference
        // Currently, that's newest to oldest
@@ -164,6 +170,7 @@ public class Mp3Parser implements Parser
        
        ID3TagsAndAudio ret = new ID3TagsAndAudio();
        ret.audio = firstAudio;
+       ret.lyrics = lyrics;
        ret.tags = tags.toArray(new ID3Tags[tags.size()]);
        return ret;
     }
@@ -171,6 +178,7 @@ public class Mp3Parser implements Parser
     protected static class ID3TagsAndAudio {
         private ID3Tags[] tags;
         private AudioFrame audio;
+        private LyricsHandler lyrics;
     }
 
 }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java?rev=958659&r1=958658&r2=958659&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java Mon Jun 28 17:52:17 2010
@@ -133,6 +133,44 @@ public class Mp3ParserTest extends TestC
         assertEquals("2", metadata.get("channels"));
     }
 
+    /**
+     * Tests that a file with both lyrics and
+     *  ID3v2 tags gets both extracted correctly
+     */
+    public void testMp3ParsingLyrics() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        // Note - our test file has a lyrics tag, but lacks any
+        //  lyrics in the tags, so we can't test that bit
+        // TODO Find a better sample file
+        
+        InputStream stream = Mp3ParserTest.class.getResourceAsStream(
+                "/test-documents/testMP3lyrics.mp3");
+        try {
+            parser.parse(stream, handler, metadata);
+        } finally {
+            stream.close();
+        }
+
+        assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Test Title", metadata.get(Metadata.TITLE));
+        assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
+
+        String content = handler.toString();
+        assertTrue(content.contains("Test Title"));
+        assertTrue(content.contains("Test Artist"));
+        assertTrue(content.contains("Test Album"));
+        assertTrue(content.contains("2008"));
+        assertTrue(content.contains("Test Comment"));
+        assertTrue(content.contains("Rock"));
+        
+        assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
+        assertEquals("44100", metadata.get("samplerate"));
+        assertEquals("2", metadata.get("channels"));
+    }
+    
     public void testID3v2Frame() throws Exception {
        byte[] empty = new byte[] {
              0x49, 0x44, 0x33, 3, 1, 0,

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3lyrics.mp3
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3lyrics.mp3?rev=958659&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3lyrics.mp3
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream