You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2010/06/28 19:52:17 UTC
svn commit: r958659 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/mp3/ test/java/org/apache/tika/parser/mp3/
test/resources/test-documents/
Author: nick
Date: Mon Jun 28 17:52:17 2010
New Revision: 958659
URL: http://svn.apache.org/viewvc?rev=958659&view=rev
Log:
MP3 Lyrics text extraction support
Updates the MP3 parser to detect a LyricsV3 block before the ID3v1 tags block. If found, the lyrics text will be captured and output.
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java (with props)
tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3lyrics.mp3 (with props)
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java?rev=958659&r1=958658&r2=958659&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java Mon Jun 28 17:52:17 2010
@@ -43,7 +43,7 @@ public class ID3v1Handler implements ID3
public ID3v1Handler(InputStream stream, ContentHandler handler)
throws IOException, SAXException, TikaException {
- this(getSuffix(stream, 128));
+ this(LyricsHandler.getSuffix(stream, 128));
}
/**
@@ -155,37 +155,4 @@ public class ID3v1Handler implements ID3
throw new TikaException("ISO-8859-1 encoding is not available", e);
}
}
-
- /**
- * Reads and returns the last <code>length</code> bytes from the
- * given stream.
- * @param stream input stream
- * @param length number of bytes from the end to read and return
- * @return stream the <code>InputStream</code> to read from.
- * @throws IOException if the stream could not be read from.
- */
- private static byte[] getSuffix(InputStream stream, int length)
- throws IOException {
- byte[] buffer = new byte[2 * length];
- int bytesInBuffer = 0;
-
- int n = stream.read(buffer);
- while (n != -1) {
- bytesInBuffer += n;
- if (bytesInBuffer == buffer.length) {
- System.arraycopy(buffer, bytesInBuffer - length, buffer, 0, length);
- bytesInBuffer = length;
- }
- n = stream.read(buffer, bytesInBuffer, buffer.length - bytesInBuffer);
- }
-
- if (bytesInBuffer < length) {
- length = bytesInBuffer;
- }
-
- byte[] result = new byte[length];
- System.arraycopy(buffer, bytesInBuffer - length, result, 0, length);
- return result;
- }
-
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java?rev=958659&r1=958658&r2=958659&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java Mon Jun 28 17:52:17 2010
@@ -86,7 +86,7 @@ public class ID3v2Frame implements MP3Fr
if (AudioFrame.isAudioHeader(h1, h2, h3, h4)) {
return new AudioFrame(h1, h2, h3, h4, inp);
}
-
+
// Not a frame header
return null;
}
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java?rev=958659&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java Mon Jun 28 17:52:17 2010
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * This is used to parse Lyrics3 tag information
+ * from an MP3 file, if available.
+ * Handles lyrics tags of up to 10kb in size.
+ * Will process any ID3v1 tag data if present.
+ * Ignores extended ID3v1 data in the lyrics block
+ *
+ * @see <a href="http://www.id3.org/Lyrics3v2">Lyrics3 v2.0 specification</a>
+ */
+public class LyricsHandler {
+    boolean foundLyrics = false; // Was the "LYRICS200" end marker seen?
+    String lyricsText = null;    // Contents of the LYR field, if one was read
+    ID3v1Handler id3v1 = null;   // Built from the last 128 bytes, if available
+
+    /** Parses the last 10kb + 128 bytes of the stream; handler is not used. */
+    public LyricsHandler(InputStream stream, ContentHandler handler)
+            throws IOException, SAXException, TikaException {
+        this(getSuffix(stream, 10240+128));
+    }
+
+    /**
+     * Looks for the Lyrics3 v2 block, which will be just before the
+     *  ID3v1 data (if present), and pulls out its LYR (lyrics) field.
+     * Also sets up the ID3v1 handler from the last 128 bytes. A block
+     *  that is malformed, or too big for tagData, is skipped quietly.
+     * @param tagData the trailing bytes of the file, ID3v1 tag included
+     */
+    protected LyricsHandler(byte[] tagData)
+            throws IOException, SAXException, TikaException {
+        if(tagData.length < 128) {
+            return;
+        }
+
+        // Is there ID3v1 data?
+        byte[] last128 = new byte[128];
+        System.arraycopy(tagData, tagData.length-128, last128, 0, 128);
+        id3v1 = new ID3v1Handler(last128);
+
+        if(tagData.length < 137) {
+            return;
+        }
+
+        // Are there lyrics? The end marker sits right before any ID3v1 tag
+        int lookat = tagData.length - 9;
+        if(id3v1.found) {
+            lookat -= 128;
+        }
+        if(!"LYRICS200".equals(new String(tagData, lookat, 9, "ASCII"))) {
+            return;
+        }
+        foundLyrics = true;
+
+        // Six digits before the marker: size of "LYRICSBEGIN" + all fields
+        int length = (lookat < 6) ? -1 :
+            parseSize(new String(tagData, lookat-6, 6, "ASCII"));
+        int start = lookat - length + 5; // First byte after "LYRICSBEGIN"
+        if(length < 11 || start < 0) {
+            return; // Bad size, or a block bigger than our buffer
+        }
+        String lyrics = new String(tagData, start, length-11, "ASCII");
+
+        // Each field is a 3 char ID, then a 5 digit size, then the data
+        int pos = 0;
+        while(pos+8 <= lyrics.length()) {
+            String tagName = lyrics.substring(pos, pos+3);
+            int tagLen = parseSize(lyrics.substring(pos+3, pos+8));
+            int end = pos + 8 + tagLen;
+            if(tagLen < 0 || end > lyrics.length()) {
+                break; // Malformed or truncated field, give up
+            }
+            if(tagName.equals("LYR")) {
+                lyricsText = lyrics.substring(pos+8, end);
+            }
+            pos = end;
+        }
+    }
+
+    /** Parses a run of ASCII digits, returning -1 if it is anything else. */
+    private static int parseSize(String digits) {
+        return digits.matches("[0-9]+") ? Integer.parseInt(digits) : -1;
+    }
+
+    public boolean hasID3v1() {
+        return id3v1 != null && id3v1.found;
+    }
+    public boolean hasLyrics() {
+        return lyricsText != null && lyricsText.length() > 0;
+    }
+
+    /**
+     * Reads and returns the last <code>length</code> bytes from the
+     * given stream.
+     * @param stream input stream
+     * @param length number of bytes from the end to read and return
+     * @return the last <code>length</code> bytes, or fewer for a short stream
+     * @throws IOException if the stream could not be read from.
+     */
+    protected static byte[] getSuffix(InputStream stream, int length)
+            throws IOException {
+        byte[] buffer = new byte[2 * length];
+        int bytesInBuffer = 0;
+
+        int n = stream.read(buffer);
+        while (n != -1) {
+            bytesInBuffer += n;
+            if (bytesInBuffer == buffer.length) {
+                System.arraycopy(buffer, bytesInBuffer - length, buffer, 0, length);
+                bytesInBuffer = length;
+            }
+            n = stream.read(buffer, bytesInBuffer, buffer.length - bytesInBuffer);
+        }
+
+        if (bytesInBuffer < length) {
+            length = bytesInBuffer;
+        }
+
+        byte[] result = new byte[length];
+        System.arraycopy(buffer, bytesInBuffer - length, result, 0, length);
+        return result;
+    }
+}
Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java?rev=958659&r1=958658&r2=958659&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java Mon Jun 28 17:52:17 2010
@@ -97,6 +97,9 @@ public class Mp3Parser implements Parser
XMPDM.AUDIO_SAMPLE_RATE,
Integer.toString(audioAndTags.audio.getSampleRate()));
}
+ if (audioAndTags.lyrics != null && audioAndTags.lyrics.hasLyrics()) {
+ xhtml.element("p", audioAndTags.lyrics.lyricsText);
+ }
xhtml.endDocument();
}
@@ -120,6 +123,7 @@ public class Mp3Parser implements Parser
ID3v23Handler v23 = null;
ID3v22Handler v22 = null;
ID3v1Handler v1 = null;
+ LyricsHandler lyrics = null;
AudioFrame firstAudio = null;
// ID3v2 tags live at the start of the file
@@ -142,8 +146,10 @@ public class Mp3Parser implements Parser
}
// ID3v1 tags live at the end of the file
- // Our handler handily seeks to the end for us
- v1 = new ID3v1Handler(stream, handler);
+ // Lyrics live just before ID3v1, at the end of the file
+ // Search for both (handlers seek to the end for us)
+ lyrics = new LyricsHandler(stream, handler);
+ v1 = lyrics.id3v1;
// Go in order of preference
// Currently, that's newest to oldest
@@ -164,6 +170,7 @@ public class Mp3Parser implements Parser
ID3TagsAndAudio ret = new ID3TagsAndAudio();
ret.audio = firstAudio;
+ ret.lyrics = lyrics;
ret.tags = tags.toArray(new ID3Tags[tags.size()]);
return ret;
}
@@ -171,6 +178,7 @@ public class Mp3Parser implements Parser
protected static class ID3TagsAndAudio {
private ID3Tags[] tags;
private AudioFrame audio;
+ private LyricsHandler lyrics;
}
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java?rev=958659&r1=958658&r2=958659&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java Mon Jun 28 17:52:17 2010
@@ -133,6 +133,44 @@ public class Mp3ParserTest extends TestC
assertEquals("2", metadata.get("channels"));
}
+ /**
+ * Tests that a file carrying a Lyrics3 block as well as ID3v2
+ * tags still has its tags and audio details extracted correctly
+ */
+ public void testMp3ParsingLyrics() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ // Note - our test file has a lyrics tag, but that tag holds
+ // no actual lyrics text, so we can't assert on the lyrics
+ // TODO Find a better sample file
+
+ InputStream stream = Mp3ParserTest.class.getResourceAsStream(
+ "/test-documents/testMP3lyrics.mp3");
+ try {
+ parser.parse(stream, handler, metadata);
+ } finally {
+ stream.close();
+ }
+
+ assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Test Title", metadata.get(Metadata.TITLE));
+ assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
+
+ String content = handler.toString();
+ assertTrue(content.contains("Test Title"));
+ assertTrue(content.contains("Test Artist"));
+ assertTrue(content.contains("Test Album"));
+ assertTrue(content.contains("2008"));
+ assertTrue(content.contains("Test Comment"));
+ assertTrue(content.contains("Rock"));
+
+ assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
+ assertEquals("44100", metadata.get("samplerate"));
+ assertEquals("2", metadata.get("channels"));
+ }
+
public void testID3v2Frame() throws Exception {
byte[] empty = new byte[] {
0x49, 0x44, 0x33, 3, 1, 0,
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3lyrics.mp3
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3lyrics.mp3?rev=958659&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3lyrics.mp3
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream