You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2008/09/14 11:27:39 UTC

svn commit: r695159 - in /incubator/tika/trunk/src: main/java/org/apache/tika/parser/mp3/ main/resources/ main/resources/mime/ test/java/org/apache/tika/

Author: jukka
Date: Sun Sep 14 02:27:39 2008
New Revision: 695159

URL: http://svn.apache.org/viewvc?rev=695159&view=rev
Log:
TIKA-120: Add support for retrieving ID3 tags from MP3 files

Patch by Dave Meikle.

Added:
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/mp3/
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/mp3/ID3v1Tag.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
Modified:
    incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
    incubator/tika/trunk/src/main/resources/tika-config.xml
    incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java

Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/mp3/ID3v1Tag.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/mp3/ID3v1Tag.java?rev=695159&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/mp3/ID3v1Tag.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/mp3/ID3v1Tag.java Sun Sep 14 02:27:39 2008
@@ -0,0 +1,273 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import org.apache.commons.lang.StringUtils;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.FileInputStream;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Arrays;
+
+/**
+ * <p>
+ * Parses and represents an ID3v1 tag: the fixed 128-byte block that may
+ * appear at the very end of an MP3 stream. Implemented based on
+ * http://www.id3.org/ID3v1.
+ * </p>
+ * <p>
+ * Instances are created with {@link #createID3v1Tag(InputStream)}.
+ * </p>
+ */
+public class ID3v1Tag {
+    /**
+     * Genre names keyed by ID3v1 genre code (0-79).
+     */
+    private static final Map<Integer, String> genres =
+            new HashMap<Integer, String>();
+
+    static {
+        genres.put(0, "Blues");
+        genres.put(1, "Classic Rock");
+        genres.put(2, "Country");
+        genres.put(3, "Dance");
+        genres.put(4, "Disco");
+        genres.put(5, "Funk");
+        genres.put(6, "Grunge");
+        genres.put(7, "Hip-Hop");
+        genres.put(8, "Jazz");
+        genres.put(9, "Metal");
+        genres.put(10, "New Age");
+        genres.put(11, "Oldies");
+        genres.put(12, "Other");
+        genres.put(13, "Pop");
+        genres.put(14, "R&B");
+        genres.put(15, "Rap");
+        genres.put(16, "Reggae");
+        genres.put(17, "Rock");
+        genres.put(18, "Techno");
+        genres.put(19, "Industrial");
+        genres.put(20, "Alternative");
+        genres.put(21, "Ska");
+        genres.put(22, "Death Metal");
+        genres.put(23, "Pranks");
+        genres.put(24, "Soundtrack");
+        genres.put(25, "Euro-Techno");
+        genres.put(26, "Ambient");
+        genres.put(27, "Trip-Hop");
+        genres.put(28, "Vocal");
+        genres.put(29, "Jazz+Funk");
+        genres.put(30, "Fusion");
+        genres.put(31, "Trance");
+        genres.put(32, "Classical");
+        genres.put(33, "Instrumental");
+        genres.put(34, "Acid");
+        genres.put(35, "House");
+        genres.put(36, "Game");
+        genres.put(37, "Sound Clip");
+        genres.put(38, "Gospel");
+        genres.put(39, "Noise");
+        genres.put(40, "AlternRock");
+        genres.put(41, "Bass");
+        genres.put(42, "Soul");
+        genres.put(43, "Punk");
+        genres.put(44, "Space");
+        genres.put(45, "Meditative");
+        genres.put(46, "Instrumental Pop");
+        genres.put(47, "Instrumental Rock");
+        genres.put(48, "Ethnic");
+        genres.put(49, "Gothic");
+        genres.put(50, "Darkwave");
+        genres.put(51, "Techno-Industrial");
+        genres.put(52, "Electronic");
+        genres.put(53, "Pop-Folk");
+        genres.put(54, "Eurodance");
+        genres.put(55, "Dream");
+        genres.put(56, "Southern Rock");
+        genres.put(57, "Comedy");
+        genres.put(58, "Cult");
+        genres.put(59, "Gangsta");
+        genres.put(60, "Top 40");
+        genres.put(61, "Christian Rap");
+        genres.put(62, "Pop/Funk");
+        genres.put(63, "Jungle");
+        genres.put(64, "Native American");
+        genres.put(65, "Cabaret");
+        genres.put(66, "New Wave");
+        genres.put(67, "Psychadelic");
+        genres.put(68, "Rave");
+        genres.put(69, "Showtunes");
+        genres.put(70, "Trailer");
+        genres.put(71, "Lo-Fi");
+        genres.put(72, "Tribal");
+        genres.put(73, "Acid Punk");
+        genres.put(74, "Acid Jazz");
+        genres.put(75, "Polka");
+        genres.put(76, "Retro");
+        genres.put(77, "Musical");
+        genres.put(78, "Rock & Roll");
+        genres.put(79, "Hard Rock");
+    }
+
+    private final String title;
+    private final String artist;
+    private final String album;
+    private final String year;
+    private final String comment;
+    private final int genre;
+
+    /**
+     * Private constructor; use {@link #createID3v1Tag(InputStream)} instead.
+     *
+     * @param title   the title.
+     * @param artist  the artist.
+     * @param album   the album.
+     * @param year    the year.
+     * @param comment the comment.
+     * @param genre   the genre code.
+     */
+    private ID3v1Tag(String title, String artist, String album,
+                     String year, String comment, int genre) {
+        this.title = title;
+        this.artist = artist;
+        this.album = album;
+        this.year = year;
+        this.comment = comment;
+        this.genre = genre;
+    }
+
+    /** Returns the title field of the tag, trimmed. */
+    public String getTitle() {
+        return title;
+    }
+
+    /** Returns the artist field of the tag, trimmed. */
+    public String getArtist() {
+        return artist;
+    }
+
+    /** Returns the album field of the tag, trimmed. */
+    public String getAlbum() {
+        return album;
+    }
+
+    /** Returns the year field of the tag, trimmed. */
+    public String getYear() {
+        return year;
+    }
+
+    /** Returns the comment field of the tag, trimmed. */
+    public String getComment() {
+        return comment;
+    }
+
+    /** Returns the numeric genre code of the tag (0-255). */
+    public int getGenre() {
+        return genre;
+    }
+
+    /**
+     * Returns the genre name for the genre code of this tag.
+     *
+     * @return the genre name, or <code>null</code> if the code is not a known genre.
+     */
+    public String getGenreAsString() {
+        return genres.get(genre);
+    }
+
+    /**
+     * Create an <code>ID3v1Tag</code> from an <code>InputStream</code>.
+     * The stream is read to its end; it is not closed.
+     *
+     * @param stream the <code>InputStream</code> to parse.
+     * @return a <code>ID3v1Tag</code> if ID3 v1 information is available,
+     *         null if the stream is shorter than 128 bytes, does not end
+     *         with a tag, or could not be read.
+     */
+    public static ID3v1Tag createID3v1Tag(InputStream stream) {
+        byte[] buffer;
+        String tag;
+        try {
+            buffer = getSuffix(stream, 128);
+            // A stream shorter than 128 bytes cannot contain a tag; decoding
+            // 128 bytes from a shorter buffer would throw.
+            if (buffer.length != 128
+                    || buffer[0] != 'T' || buffer[1] != 'A' || buffer[2] != 'G') {
+                return null;
+            }
+            // Decode with a fixed single-byte charset so that every byte maps
+            // to exactly one character regardless of the platform default.
+            tag = new String(buffer, 0, 128, "ISO-8859-1");
+        } catch (IOException ex) {
+            return null;
+        }
+
+        String title = tag.substring(3, 33).trim();
+        String artist = tag.substring(33, 63).trim();
+        String album = tag.substring(63, 93).trim();
+        String year = tag.substring(93, 97).trim();
+        String comment = tag.substring(97, 127).trim();
+        // The genre code is a single unsigned byte.
+        int genre = buffer[127] & 0xFF;
+
+        // Return new ID3v1Tag instance.
+        return new ID3v1Tag(title, artist, album, year, comment, genre);
+    }
+
+    /**
+     * Reads and returns the last <code>length</code> bytes from the
+     * given stream. The stream is read until its end and is not closed.
+     *
+     * @param stream input stream
+     * @param length number of bytes from the end to read and return
+     * @return the last <code>length</code> bytes of the stream, or all of
+     *         its bytes if the stream holds fewer than <code>length</code>.
+     * @throws IOException if the stream could not be read from.
+     */
+    private static byte[] getSuffix(InputStream stream, int length) throws IOException {
+        // The buffer holds up to two windows of <length> bytes, so the most
+        // recent <length> bytes are always available once it fills up.
+        byte[] buffer = new byte[2 * length];
+        int bytesInBuffer = 0;
+
+        int n = stream.read(buffer);
+        while (n != -1) {
+            bytesInBuffer += n;
+            if (bytesInBuffer == buffer.length) {
+                // Buffer full: keep only the newest <length> bytes by moving
+                // them to the front, freeing the second half for more input.
+                System.arraycopy(buffer, bytesInBuffer - length, buffer, 0, length);
+                bytesInBuffer = length;
+            }
+            n = stream.read(buffer, bytesInBuffer, buffer.length - bytesInBuffer);
+        }
+
+        // The stream was shorter than requested: return everything read.
+        if (bytesInBuffer < length) {
+            length = bytesInBuffer;
+        }
+
+        byte[] result = new byte[length];
+        System.arraycopy(buffer, bytesInBuffer - length, result, 0, length);
+        return result;
+    }
+
+}

Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java?rev=695159&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java Sun Sep 14 02:27:39 2008
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.commons.lang.StringUtils;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * <p>
+ * The <code>Mp3Parser</code> is used to parse ID3 Version 1 Tag information
+ * from an MP3 file, if available.
+ * </p>
+ */
+public class Mp3Parser extends AbstractParser {
+
+    /**
+     * Reads the ID3v1 tag from the end of the given stream and writes each
+     * non-empty tag field as a paragraph to the XHTML output. The title and
+     * comment are also stored in the metadata, and the content type is
+     * always set to <code>audio/mpeg</code>.
+     *
+     * @param stream   the MP3 stream to parse
+     * @param handler  handler that receives the XHTML SAX events
+     * @param metadata metadata object to populate
+     * @throws IOException   part of the declared parse signature
+     * @throws SAXException  if the XHTML events can not be written
+     * @throws TikaException part of the declared parse signature
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler, Metadata metadata)
+            throws IOException, SAXException, TikaException {
+        metadata.set(Metadata.CONTENT_TYPE, "audio/mpeg");
+
+        ID3v1Tag tag = ID3v1Tag.createID3v1Tag(stream);
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        if (null != tag) {
+            if (StringUtils.isNotEmpty(tag.getTitle())) {
+                xhtml.element("p", tag.getTitle());
+                xhtml.characters("\n");
+                metadata.set(Metadata.TITLE, tag.getTitle());
+            }
+            if (StringUtils.isNotEmpty(tag.getArtist())) {
+                xhtml.element("p", tag.getArtist());
+                xhtml.characters("\n");
+            }
+            if (StringUtils.isNotEmpty(tag.getAlbum())) {
+                xhtml.element("p", tag.getAlbum());
+                xhtml.characters("\n");
+            }
+            if (StringUtils.isNotEmpty(tag.getYear())) {
+                xhtml.element("p", tag.getYear());
+                xhtml.characters("\n");
+            }
+            if (StringUtils.isNotEmpty(tag.getComment())) {
+                xhtml.element("p", tag.getComment());
+                xhtml.characters("\n");
+                metadata.set(Metadata.COMMENTS, tag.getComment());
+            }
+            if (StringUtils.isNotEmpty(tag.getGenreAsString())) {
+                xhtml.element("p", tag.getGenreAsString());
+            }
+        }
+        // Always close the document, also when no tag or genre was found.
+        xhtml.endDocument();
+    }
+}

Modified: incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml?rev=695159&r1=695158&r2=695159&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml (original)
+++ incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml Sun Sep 14 02:27:39 2008
@@ -6,9 +6,9 @@
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at
-  
+
   http://www.apache.org/licenses/LICENSE-2.0
-  
+
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -17,7 +17,7 @@
 -->
 <!--
   Description: This xml file defines the valid mime types used by Tika.
-  The mime types within this file are based on the types in the mime-types.xml 
+  The mime types within this file are based on the types in the mime-types.xml
   file available in Apache Nutch.
 -->
 <mime-info>
@@ -654,6 +654,10 @@
     <glob pattern="*.ice" />
   </mime-type>
 
+  <mime-type type="audio/mpeg">
+    <glob pattern="*.mp3" />
+  </mime-type>
+
   <!-- ===================================================================== -->
   <!-- TIKA-85: http://www.apache.org/dev/svn-eol-style.txt                  -->
   <!-- ===================================================================== -->

Modified: incubator/tika/trunk/src/main/resources/tika-config.xml
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/tika-config.xml?rev=695159&r1=695158&r2=695159&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/tika-config.xml (original)
+++ incubator/tika/trunk/src/main/resources/tika-config.xml Sun Sep 14 02:27:39 2008
@@ -125,6 +125,9 @@
                 <mime>application/x-tika-java-class</mime>
         </parser>
 
+        <parser name="parse-mp3" class="org.apache.tika.parser.mp3.Mp3Parser">
+                <mime>audio/mpeg</mime>
+        </parser>
     </parsers>
 
 </properties>
\ No newline at end of file

Modified: incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java?rev=695159&r1=695158&r2=695159&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java Sun Sep 14 02:27:39 2008
@@ -181,6 +181,16 @@
         assertNotNull(parser);
     }
 
+    public void testMP3Extraction() throws Exception {
+        // Parsing with and without an explicit mime type must give the same text.
+        File mp3 = getTestFile("testMP3.mp3");
+        String untyped = ParseUtils.getStringContent(mp3, tc);
+        String typed = ParseUtils.getStringContent(mp3, tc, "audio/mpeg");
+        assertEquals(untyped, typed);
+        // A parser has to be registered for the MP3 mime type.
+        assertNotNull(tc.getParser("audio/mpeg"));
+    }
+
     public void testZipExtraction() throws Exception {
         File zip = getTestFile("test-documents.zip");
         List<Parser> parsers = ParseUtils.getParsersFromZip(zip, tc);