You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2008/09/14 13:16:43 UTC

svn commit: r695191 - in /incubator/tika/trunk: ./ src/main/java/org/apache/tika/parser/mp3/ src/test/java/org/apache/tika/parser/mp3/

Author: jukka
Date: Sun Sep 14 04:16:43 2008
New Revision: 695191

URL: http://svn.apache.org/viewvc?rev=695191&view=rev
Log:
TIKA-120: Add support for retrieving ID3 tags from MP3 files

Inline tag parsing into Mp3Parser class to simplify the code.

Add a more specific test case.

Added:
    incubator/tika/trunk/src/test/java/org/apache/tika/parser/mp3/
    incubator/tika/trunk/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java
Removed:
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/mp3/ID3v1Tag.java
Modified:
    incubator/tika/trunk/CHANGES.txt
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java

Modified: incubator/tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/CHANGES.txt?rev=695191&r1=695190&r2=695191&view=diff
==============================================================================
--- incubator/tika/trunk/CHANGES.txt (original)
+++ incubator/tika/trunk/CHANGES.txt Sun Sep 14 04:16:43 2008
@@ -75,6 +75,9 @@
 
 32. TIKA-108 - New Tika logos (Yongqian Li & Jukka Zitting)
 
+33. TIKA-120 - Add support for retrieving ID3 tags from MP3 files
+               (Dave Meikle & Jukka Zitting)
+
 Release 0.1-incubating - 12/27/2007
 
 1. TIKA-5 - Port Metadata Framework from Nutch (mattmann)

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java?rev=695191&r1=695190&r2=695191&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java Sun Sep 14 04:16:43 2008
@@ -16,60 +16,222 @@
  */
 package org.apache.tika.parser.mp3;
 
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.sax.XHTMLContentHandler;
-import org.apache.commons.lang.StringUtils;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
-import java.io.IOException;
-import java.io.InputStream;
-
 /**
- * <p>
  * The <code>Mp3Parser</code> is used to parse ID3 Version 1 Tag information
  * from an MP3 file, if available.
- * </p>
+ *
+ * @see <a href="http://www.id3.org/ID3v1">http://www.id3.org/ID3v1</a>
  */
 public class Mp3Parser extends AbstractParser {
 
+    /**
+     * Predefined ID3v1 genre names, indexed by the genre byte of the tag.
+     * The empty last entry is used for any code beyond the predefined list.
+     * @see <a href="http://www.id3.org/id3v2-00">http://www.id3.org/id3v2-00</a>
+     */
+    private static final String[] GENRES = new String[] {
+        /*  0 */ "Blues",
+        /*  1 */ "Classic Rock",
+        /*  2 */ "Country",
+        /*  3 */ "Dance",
+        /*  4 */ "Disco",
+        /*  5 */ "Funk",
+        /*  6 */ "Grunge",
+        /*  7 */ "Hip-Hop",
+        /*  8 */ "Jazz",
+        /*  9 */ "Metal",
+        /* 10 */ "New Age",
+        /* 11 */ "Oldies",
+        /* 12 */ "Other",
+        /* 13 */ "Pop",
+        /* 14 */ "R&B",
+        /* 15 */ "Rap",
+        /* 16 */ "Reggae",
+        /* 17 */ "Rock",
+        /* 18 */ "Techno",
+        /* 19 */ "Industrial",
+        /* 20 */ "Alternative",
+        /* 21 */ "Ska",
+        /* 22 */ "Death Metal",
+        /* 23 */ "Pranks",
+        /* 24 */ "Soundtrack",
+        /* 25 */ "Euro-Techno",
+        /* 26 */ "Ambient",
+        /* 27 */ "Trip-Hop",
+        /* 28 */ "Vocal",
+        /* 29 */ "Jazz+Funk",
+        /* 30 */ "Fusion",
+        /* 31 */ "Trance",
+        /* 32 */ "Classical",
+        /* 33 */ "Instrumental",
+        /* 34 */ "Acid",
+        /* 35 */ "House",
+        /* 36 */ "Game",
+        /* 37 */ "Sound Clip",
+        /* 38 */ "Gospel",
+        /* 39 */ "Noise",
+        /* 40 */ "AlternRock",
+        /* 41 */ "Bass",
+        /* 42 */ "Soul",
+        /* 43 */ "Punk",
+        /* 44 */ "Space",
+        /* 45 */ "Meditative",
+        /* 46 */ "Instrumental Pop",
+        /* 47 */ "Instrumental Rock",
+        /* 48 */ "Ethnic",
+        /* 49 */ "Gothic",
+        /* 50 */ "Darkwave",
+        /* 51 */ "Techno-Industrial",
+        /* 52 */ "Electronic",
+        /* 53 */ "Pop-Folk",
+        /* 54 */ "Eurodance",
+        /* 55 */ "Dream",
+        /* 56 */ "Southern Rock",
+        /* 57 */ "Comedy",
+        /* 58 */ "Cult",
+        /* 59 */ "Gangsta",
+        /* 60 */ "Top 40",
+        /* 61 */ "Christian Rap",
+        /* 62 */ "Pop/Funk",
+        /* 63 */ "Jungle",
+        /* 64 */ "Native American",
+        /* 65 */ "Cabaret",
+        /* 66 */ "New Wave",
+        /* 67 */ "Psychadelic",
+        /* 68 */ "Rave",
+        /* 69 */ "Showtunes",
+        /* 70 */ "Trailer",
+        /* 71 */ "Lo-Fi",
+        /* 72 */ "Tribal",
+        /* 73 */ "Acid Punk",
+        /* 74 */ "Acid Jazz",
+        /* 75 */ "Polka",
+        /* 76 */ "Retro",
+        /* 77 */ "Musical",
+        /* 78 */ "Rock & Roll",
+        /* 79 */ "Hard Rock",
+        /* sentinel */ ""
+    };
+
     public void parse(
             InputStream stream, ContentHandler handler, Metadata metadata)
             throws IOException, SAXException, TikaException {
         metadata.set(Metadata.CONTENT_TYPE, "audio/mpeg");
 
-        ID3v1Tag tag = ID3v1Tag.createID3v1Tag(stream);
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         xhtml.startDocument();
-        if (null != tag) {
-            if (StringUtils.isNotEmpty(tag.getTitle())) {
-                xhtml.element("p", tag.getTitle());
-                xhtml.characters("\n");
-                metadata.set(Metadata.TITLE, tag.getTitle());
-            }
-            if (StringUtils.isNotEmpty(tag.getArtist())) {
-                xhtml.element("p", tag.getArtist());
-                xhtml.characters("\n");
-            }
-            if (StringUtils.isNotEmpty(tag.getAlbum())) {
-                xhtml.element("p", tag.getAlbum());
-                xhtml.characters("\n");
-            }
-            if (StringUtils.isNotEmpty(tag.getYear())) {
-                xhtml.element("p", tag.getYear());
-                xhtml.characters("\n");
-            }
-            if (StringUtils.isNotEmpty(tag.getComment())) {
-                xhtml.element("p", tag.getComment());
-                xhtml.characters("\n");
-                metadata.set(Metadata.COMMENTS, tag.getComment());
-            }
-            if (StringUtils.isNotEmpty(tag.getGenreAsString())) {
-                xhtml.element("p", tag.getGenreAsString());
-                xhtml.endDocument();
-            }
+        // A tag is recognized only in the last 128 bytes, starting with "TAG"
+        byte[] buffer = getSuffix(stream, 128);
+        if (buffer.length == 128
+               && buffer[0] == 'T' && buffer[1] == 'A' && buffer[2] == 'G') {
+            String title = getString(buffer, 3, 33);
+            String artist = getString(buffer, 33, 63);
+            String album = getString(buffer, 63, 93);
+            String year = getString(buffer, 93, 97);
+            String comment = getString(buffer, 97, 127);
+            int genre = (int) buffer[127] & 0xff; // unsigned byte
+            // Title and artist also become metadata; every field goes to the body
+            metadata.set(Metadata.TITLE, title);
+            metadata.set(Metadata.AUTHOR, artist);
+
+            xhtml.element("h1", title);
+            xhtml.characters("\n");
+            xhtml.element("p", artist);
+            xhtml.characters("\n");
+            xhtml.element("p", album);
+            xhtml.characters("\n");
+            xhtml.element("p", year);
+            xhtml.characters("\n");
+            xhtml.element("p", comment);
+            xhtml.characters("\n");
+            xhtml.element("p", GENRES[Math.min(genre, GENRES.length - 1)]);
+            xhtml.characters("\n");
         }
+
+        xhtml.endDocument();
     }
+
+    /**
+     * Returns the identified ISO-8859-1 substring from the given byte buffer.
+     * The return value is the zero-terminated substring retrieved from
+     * between the given start and end positions in the given byte buffer.
+     * Leading and trailing whitespace and control characters (unsigned byte
+     * values 0x00-0x20) are removed; bytes 0x80-0xFF are kept as text.
+     *
+     * @param buffer byte buffer
+     * @param start start index of the substring (inclusive)
+     * @param end end index of the substring (exclusive)
+     * @return the identified substring
+     * @throws TikaException if the ISO-8859-1 encoding is not available
+     */
+    private static String getString(byte[] buffer, int start, int end)
+            throws TikaException {
+        // Find the zero byte that marks the end of the string
+        int zero = start;
+        while (zero < end && buffer[zero] != 0) {
+            zero++;
+        }
+
+        // Skip trailing whitespace (unsigned compare: bytes >= 0x80 are text)
+        end = zero;
+        while (start < end && (buffer[end - 1] & 0xff) <= ' ') {
+            end--;
+        }
+
+        // Skip leading whitespace, again comparing the unsigned byte value
+        while (start < end && (buffer[start] & 0xff) <= ' ') {
+            start++;
+        }
+
+        // Return the remaining substring
+        try {
+            return new String(buffer, start, end - start, "ISO-8859-1");
+        } catch (UnsupportedEncodingException e) {
+            throw new TikaException("ISO-8859-1 encoding is not available", e);
+        }
+    }
+
+    /**
+     * Reads the given stream to its end and returns the last
+     * <code>length</code> bytes, or all bytes if the stream is shorter.
+     * @param stream input stream; read until end of stream, not closed here
+     * @param length number of trailing bytes wanted; assumed positive (caller passes 128)
+     * @return the trailing bytes of the stream, at most <code>length</code> of them
+     * @throws IOException if the stream could not be read from.
+     */
+   private static byte[] getSuffix(InputStream stream, int length)
+           throws IOException {
+       byte[] buffer = new byte[2 * length];
+       int bytesInBuffer = 0;
+       // Whenever the buffer fills up, keep only its last length bytes and go on
+       int n = stream.read(buffer);
+       while (n != -1) {
+           bytesInBuffer += n;
+           if (bytesInBuffer == buffer.length) {
+               System.arraycopy(buffer, bytesInBuffer - length, buffer, 0, length);
+               bytesInBuffer = length;
+           }
+           n = stream.read(buffer, bytesInBuffer, buffer.length - bytesInBuffer);
+       }
+       // The stream was shorter than requested: return everything that was read
+       if (bytesInBuffer < length) {
+           length = bytesInBuffer;
+       }
+       // Copy the tail of the buffer into an exactly sized result array
+       byte[] result = new byte[length];
+       System.arraycopy(buffer, bytesInBuffer - length, result, 0, length);
+       return result;
+   }
+
 }

Added: incubator/tika/trunk/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java?rev=695191&view=auto
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java (added)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java Sun Sep 14 04:16:43 2008
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.InputStream;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing mp3 files.
+ */
+public class Mp3ParserTest extends TestCase {
+    /** Parses the testMP3.mp3 test resource and checks metadata and body text. */
+    public void testMp3Parsing() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.RESOURCE_NAME_KEY, "testMP3.mp3");
+
+        InputStream stream = Mp3ParserTest.class.getResourceAsStream(
+                "/test-documents/testMP3.mp3");
+        assertNotNull("/test-documents/testMP3.mp3 not found", stream);
+        try {
+            parser.parse(stream, handler, metadata);
+        } finally {
+            stream.close();
+        }
+
+        assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Test Title", metadata.get(Metadata.TITLE));
+        assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
+
+        String content = handler.toString();
+        assertTrue(content.contains("Test Title"));
+        assertTrue(content.contains("Test Artist"));
+        assertTrue(content.contains("Test Album"));
+        assertTrue(content.contains("2008"));
+        assertTrue(content.contains("Test Comment"));
+        assertTrue(content.contains("Rock"));
+    }
+}