You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2008/09/14 13:16:43 UTC
svn commit: r695191 - in /incubator/tika/trunk: ./
src/main/java/org/apache/tika/parser/mp3/
src/test/java/org/apache/tika/parser/mp3/
Author: jukka
Date: Sun Sep 14 04:16:43 2008
New Revision: 695191
URL: http://svn.apache.org/viewvc?rev=695191&view=rev
Log:
TIKA-120: Add support for retrieving ID3 tags from MP3 files
Inline tag parsing into Mp3Parser class to simplify the code.
Add a more specific test case.
Added:
incubator/tika/trunk/src/test/java/org/apache/tika/parser/mp3/
incubator/tika/trunk/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java
Removed:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mp3/ID3v1Tag.java
Modified:
incubator/tika/trunk/CHANGES.txt
incubator/tika/trunk/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
Modified: incubator/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=695191&r1=695190&r2=695191&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Sun Sep 14 04:16:43 2008
@@ -75,6 +75,9 @@
32. TIKA-108 - New Tika logos (Yongqian Li & Jukka Zitting)
+33. TIKA-120 - Add support for retrieving ID3 tags from MP3 files
+ (Dave Meikle & Jukka Zitting)
+
Release 0.1-incubating - 12/27/2007
1. TIKA-5 - Port Metadata Framework from Nutch (mattmann)
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java?rev=695191&r1=695190&r2=695191&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java Sun Sep 14 04:16:43 2008
@@ -16,60 +16,222 @@
*/
package org.apache.tika.parser.mp3;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.sax.XHTMLContentHandler;
-import org.apache.commons.lang.StringUtils;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-import java.io.IOException;
-import java.io.InputStream;
-
/**
- * <p>
* The <code>Mp3Parser</code> is used to parse ID3 Version 1 Tag information
* from an MP3 file, if available.
- * </p>
+ *
+ * @see <a href="http://www.id3.org/ID3v1">ID3v1</a>
*/
public class Mp3Parser extends AbstractParser {
+ /**
+ * List of predefined genres, indexed by the genre byte of the tag.
+ * The final empty entry is a sentinel: the parser clamps every genre
+ * value beyond the predefined range to that last index, so unknown
+ * genres produce an empty string.
+ *
+ * @see <a href="http://www.id3.org/id3v2-00">http://www.id3.org/id3v2-00</a>
+ */
+ private static final String[] GENRES = new String[] {
+ /* 0 */ "Blues",
+ /* 1 */ "Classic Rock",
+ /* 2 */ "Country",
+ /* 3 */ "Dance",
+ /* 4 */ "Disco",
+ /* 5 */ "Funk",
+ /* 6 */ "Grunge",
+ /* 7 */ "Hip-Hop",
+ /* 8 */ "Jazz",
+ /* 9 */ "Metal",
+ /* 10 */ "New Age",
+ /* 11 */ "Oldies",
+ /* 12 */ "Other",
+ /* 13 */ "Pop",
+ /* 14 */ "R&B",
+ /* 15 */ "Rap",
+ /* 16 */ "Reggae",
+ /* 17 */ "Rock",
+ /* 18 */ "Techno",
+ /* 19 */ "Industrial",
+ /* 20 */ "Alternative",
+ /* 21 */ "Ska",
+ /* 22 */ "Death Metal",
+ /* 23 */ "Pranks",
+ /* 24 */ "Soundtrack",
+ /* 25 */ "Euro-Techno",
+ /* 26 */ "Ambient",
+ /* 27 */ "Trip-Hop",
+ /* 28 */ "Vocal",
+ /* 29 */ "Jazz+Funk",
+ /* 30 */ "Fusion",
+ /* 31 */ "Trance",
+ /* 32 */ "Classical",
+ /* 33 */ "Instrumental",
+ /* 34 */ "Acid",
+ /* 35 */ "House",
+ /* 36 */ "Game",
+ /* 37 */ "Sound Clip",
+ /* 38 */ "Gospel",
+ /* 39 */ "Noise",
+ /* 40 */ "AlternRock",
+ /* 41 */ "Bass",
+ /* 42 */ "Soul",
+ /* 43 */ "Punk",
+ /* 44 */ "Space",
+ /* 45 */ "Meditative",
+ /* 46 */ "Instrumental Pop",
+ /* 47 */ "Instrumental Rock",
+ /* 48 */ "Ethnic",
+ /* 49 */ "Gothic",
+ /* 50 */ "Darkwave",
+ /* 51 */ "Techno-Industrial",
+ /* 52 */ "Electronic",
+ /* 53 */ "Pop-Folk",
+ /* 54 */ "Eurodance",
+ /* 55 */ "Dream",
+ /* 56 */ "Southern Rock",
+ /* 57 */ "Comedy",
+ /* 58 */ "Cult",
+ /* 59 */ "Gangsta",
+ /* 60 */ "Top 40",
+ /* 61 */ "Christian Rap",
+ /* 62 */ "Pop/Funk",
+ /* 63 */ "Jungle",
+ /* 64 */ "Native American",
+ /* 65 */ "Cabaret",
+ /* 66 */ "New Wave",
+ /* 67 */ "Psychadelic",
+ /* 68 */ "Rave",
+ /* 69 */ "Showtunes",
+ /* 70 */ "Trailer",
+ /* 71 */ "Lo-Fi",
+ /* 72 */ "Tribal",
+ /* 73 */ "Acid Punk",
+ /* 74 */ "Acid Jazz",
+ /* 75 */ "Polka",
+ /* 76 */ "Retro",
+ /* 77 */ "Musical",
+ /* 78 */ "Rock & Roll",
+ /* 79 */ "Hard Rock",
+ /* sentinel */ ""
+ };
+
+ /**
+ * Extracts ID3 version 1 tag information from the given MP3 stream.
+ * <p>
+ * The content type is always set to <code>audio/mpeg</code>. The last
+ * 128 bytes of the stream are then inspected: if they start with the
+ * marker <code>TAG</code>, the title, artist, album, year, comment and
+ * genre are read from fixed offsets within that block. The title and
+ * artist are stored in the metadata and all six fields are written to
+ * the XHTML output. If no such block is found, only the document start
+ * and end events are emitted.
+ *
+ * @param stream MP3 stream; it is read to its end
+ * @param handler receiver of the generated XHTML SAX events
+ * @param metadata metadata object to populate
+ * @throws IOException if the stream could not be read
+ * @throws SAXException if the XHTML output could not be written
+ * @throws TikaException if a tag field could not be decoded
+ */
public void parse(
InputStream stream, ContentHandler handler, Metadata metadata)
throws IOException, SAXException, TikaException {
metadata.set(Metadata.CONTENT_TYPE, "audio/mpeg");
- ID3v1Tag tag = ID3v1Tag.createID3v1Tag(stream);
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
- if (null != tag) {
- if (StringUtils.isNotEmpty(tag.getTitle())) {
- xhtml.element("p", tag.getTitle());
- xhtml.characters("\n");
- metadata.set(Metadata.TITLE, tag.getTitle());
- }
- if (StringUtils.isNotEmpty(tag.getArtist())) {
- xhtml.element("p", tag.getArtist());
- xhtml.characters("\n");
- }
- if (StringUtils.isNotEmpty(tag.getAlbum())) {
- xhtml.element("p", tag.getAlbum());
- xhtml.characters("\n");
- }
- if (StringUtils.isNotEmpty(tag.getYear())) {
- xhtml.element("p", tag.getYear());
- xhtml.characters("\n");
- }
- if (StringUtils.isNotEmpty(tag.getComment())) {
- xhtml.element("p", tag.getComment());
- xhtml.characters("\n");
- metadata.set(Metadata.COMMENTS, tag.getComment());
- }
- if (StringUtils.isNotEmpty(tag.getGenreAsString())) {
- xhtml.element("p", tag.getGenreAsString());
- xhtml.endDocument();
- }
+
+ // Look for a tag in the final 128 bytes of the stream: the block must
+ // be complete and start with the three-byte marker "TAG"
+ byte[] buffer = getSuffix(stream, 128);
+ if (buffer.length == 128
+ && buffer[0] == 'T' && buffer[1] == 'A' && buffer[2] == 'G') {
+ String title = getString(buffer, 3, 33);
+ String artist = getString(buffer, 33, 63);
+ String album = getString(buffer, 63, 93);
+ String year = getString(buffer, 93, 97);
+ String comment = getString(buffer, 97, 127);
+ int genre = (int) buffer[127] & 0xff; // unsigned byte
+
+ metadata.set(Metadata.TITLE, title);
+ metadata.set(Metadata.AUTHOR, artist);
+
+ // Every field is emitted, even when its value is empty
+ xhtml.element("h1", title);
+ xhtml.characters("\n");
+ xhtml.element("p", artist);
+ xhtml.characters("\n");
+ xhtml.element("p", album);
+ xhtml.characters("\n");
+ xhtml.element("p", year);
+ xhtml.characters("\n");
+ xhtml.element("p", comment);
+ xhtml.characters("\n");
+ // Genre values beyond the predefined list map to the empty sentinel entry
+ xhtml.element("p", GENRES[Math.min(genre, GENRES.length - 1)]);
+ xhtml.characters("\n");
}
+
+ xhtml.endDocument();
}
+
+ /**
+  * Returns the identified ISO-8859-1 substring from the given byte buffer.
+  * The return value is the zero-terminated substring retrieved from
+  * between the given start and end positions in the given byte buffer.
+  * Leading and trailing whitespace and ASCII control characters (byte
+  * values 0x00-0x20) are removed. Bytes are compared as unsigned values,
+  * so ISO-8859-1 characters above 0x7F (for example accented letters)
+  * at either end of the substring are preserved.
+  *
+  * @param buffer byte buffer
+  * @param start start index of the substring (inclusive)
+  * @param end end index of the substring (exclusive)
+  * @return the identified substring, possibly empty
+  * @throws TikaException if the ISO-8859-1 encoding is not available
+  */
+ private static String getString(byte[] buffer, int start, int end)
+         throws TikaException {
+     // Find the zero byte that marks the end of the string
+     int zero = start;
+     while (zero < end && buffer[zero] != 0) {
+         zero++;
+     }
+
+     // Skip trailing whitespace. Java bytes are signed, so mask to the
+     // unsigned value first; otherwise 0x80-0xFF would compare as negative
+     // numbers and be stripped as if they were whitespace.
+     end = zero;
+     while (start < end && (buffer[end - 1] & 0xff) <= ' ') {
+         end--;
+     }
+
+     // Skip leading whitespace (same unsigned comparison)
+     while (start < end && (buffer[start] & 0xff) <= ' ') {
+         start++;
+     }
+
+     // Return the remaining substring
+     try {
+         return new String(buffer, start, end - start, "ISO-8859-1");
+     } catch (UnsupportedEncodingException e) {
+         throw new TikaException("ISO-8859-1 encoding is not available", e);
+     }
+ }
+
+ /**
+  * Reads the given stream to its end and returns the last
+  * <code>length</code> bytes that were read. If the stream contains
+  * fewer than <code>length</code> bytes, all of them are returned.
+  * The stream is not closed. If <code>length</code> is not positive,
+  * nothing is read and an empty array is returned.
+  *
+  * @param stream input stream
+  * @param length maximum number of bytes from the end to read and return
+  * @return the trailing bytes of the stream, in stream order
+  * @throws IOException if the stream could not be read from.
+  */
+ private static byte[] getSuffix(InputStream stream, int length)
+         throws IOException {
+     // With a zero-sized window every read() would request zero bytes and
+     // return 0, so the loop below would never see the end of the stream.
+     if (length <= 0) {
+         return new byte[0];
+     }
+
+     // The buffer holds up to twice the requested suffix, so the newest
+     // <length> bytes are always available when it fills up.
+     byte[] buffer = new byte[2 * length];
+     int bytesInBuffer = 0;
+
+     int n = stream.read(buffer);
+     while (n != -1) {
+         bytesInBuffer += n;
+         if (bytesInBuffer == buffer.length) {
+             // Buffer full: keep only the newest <length> bytes, moved to the front
+             System.arraycopy(buffer, bytesInBuffer - length, buffer, 0, length);
+             bytesInBuffer = length;
+         }
+         n = stream.read(buffer, bytesInBuffer, buffer.length - bytesInBuffer);
+     }
+
+     // Shorter stream than requested: return everything that was read
+     if (bytesInBuffer < length) {
+         length = bytesInBuffer;
+     }
+
+     byte[] result = new byte[length];
+     System.arraycopy(buffer, bytesInBuffer - length, result, 0, length);
+     return result;
+ }
+
}
Added: incubator/tika/trunk/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java?rev=695191&view=auto
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java (added)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java Sun Sep 14 04:16:43 2008
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.InputStream;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing mp3 files.
+ */
+public class Mp3ParserTest extends TestCase {
+
+    /**
+     * Parses the <code>testMP3.mp3</code> test document through the
+     * auto-detecting parser and checks the content type, the title and
+     * author metadata, and that every tag field appears in the
+     * extracted text.
+     */
+    public void testMp3Parsing() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.RESOURCE_NAME_KEY, "testMP3.mp3");
+
+        InputStream stream = Mp3ParserTest.class.getResourceAsStream(
+                "/test-documents/testMP3.mp3");
+        // getResourceAsStream returns null for a missing resource; fail with
+        // a clear message instead of a NullPointerException further down
+        assertNotNull(
+                "Test document /test-documents/testMP3.mp3 not found", stream);
+        try {
+            parser.parse(stream, handler, metadata);
+        } finally {
+            stream.close();
+        }
+
+        assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Test Title", metadata.get(Metadata.TITLE));
+        assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
+
+        String content = handler.toString();
+        assertTrue(content.contains("Test Title"));
+        assertTrue(content.contains("Test Artist"));
+        assertTrue(content.contains("Test Album"));
+        assertTrue(content.contains("2008"));
+        assertTrue(content.contains("Test Comment"));
+        assertTrue(content.contains("Rock"));
+    }
+
+}