You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/01/26 12:27:34 UTC

svn commit: r903176 - in /lucene/tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/mp3/ test/java/org/apache/tika/ test/java/org/apache/tika/parser/mp3/ test/resources/test-documents/

Author: jukka
Date: Tue Jan 26 11:27:33 2010
New Revision: 903176

URL: http://svn.apache.org/viewvc?rev=903176&view=rev
Log:
TIKA-368: ID3v2 support for mp3 parser

Patch by Nick Burch

Added:
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java
    lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3id3v1.mp3
      - copied unchanged from r903148, lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3.mp3
    lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3id3v1_v2.mp3   (with props)
    lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3id3v2.mp3   (with props)
Removed:
    lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3.mp3
Modified:
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
    lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/TestParsers.java
    lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java

Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java?rev=903176&view=auto
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java (added)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java Tue Jan 26 11:27:33 2010
@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+/**
+ * Takes an array of {@link ID3Tags} in preference order, and when asked for
+ * a given tag, will return it from the first {@link ID3Tags} that has it.
+ */
+public class CompositeTagHandler implements ID3Tags {
+
+    private ID3Tags[] tags;
+
+    public CompositeTagHandler(ID3Tags[] tags) {
+        this.tags = tags;
+    }
+
+    public boolean getTagsPresent() {
+        for (ID3Tags tag : tags) {
+            if (tag.getTagsPresent()) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    public String getTitle() {
+        for (ID3Tags tag : tags) {
+            if (tag.getTitle() != null) {
+                return tag.getTitle();
+            }
+        }
+        return null;
+    }
+
+    public String getArtist() {
+        for (ID3Tags tag : tags) {
+            if (tag.getArtist() != null) {
+                return tag.getArtist();
+            }
+        }
+        return null;
+    }
+
+    public String getAlbum() {
+        for (ID3Tags tag : tags) {
+            if (tag.getAlbum() != null) {
+                return tag.getAlbum();
+            }
+        }
+        return null;
+    }
+
+    public String getYear() {
+        for (ID3Tags tag : tags) {
+            if (tag.getYear() != null) {
+                return tag.getYear();
+            }
+        }
+        return null;
+    }
+
+    public String getComment() {
+        for (ID3Tags tag : tags) {
+            if (tag.getComment() != null) {
+                return tag.getComment();
+            }
+        }
+        return null;
+    }
+
+    public String getGenre() {
+        for (ID3Tags tag : tags) {
+            if (tag.getGenre() != null) {
+                return tag.getGenre();
+            }
+        }
+        return null;
+    }
+
+    public String getTrackNumber() {
+        for (ID3Tags tag : tags) {
+            if (tag.getTrackNumber() != null) {
+                return tag.getTrackNumber();
+            }
+        }
+        return null;
+    }
+
+}

Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java?rev=903176&view=auto
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java (added)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java Tue Jan 26 11:27:33 2010
@@ -0,0 +1,185 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+
+/**
+ * Common interface implemented by the ID3 tag parsers,
+ *  such as ID3v1 and ID3v2.3.
+ * Implementations should return <code>null</code> if the file lacks a
+ *  given tag, or if the tag isn't defined for the version.
+ *  
+ * Note that so far, only the ID3v1 core tags are listed here. In
+ *  future, we may wish to add more to cover the extra tags that
+ *  our ID3v2 handlers can produce.
+ */
+public interface ID3Tags {
+    /**
+     * Predefined genre names, indexed by their numeric genre ID (0-125).
+     * The final entry is an empty-string sentinel, so the array holds
+     * 127 elements in total.
+     *
+     * @see <a href="http://www.id3.org/id3v2-00">ID3v2 specification</a>
+     */
+    String[] GENRES = {
+        "Blues", // 0
+        "Classic Rock", // 1
+        "Country", // 2
+        "Dance", // 3
+        "Disco", // 4
+        "Funk", // 5
+        "Grunge", // 6
+        "Hip-Hop", // 7
+        "Jazz", // 8
+        "Metal", // 9
+        "New Age", // 10
+        "Oldies", // 11
+        "Other", // 12
+        "Pop", // 13
+        "R&B", // 14
+        "Rap", // 15
+        "Reggae", // 16
+        "Rock", // 17
+        "Techno", // 18
+        "Industrial", // 19
+        "Alternative", // 20
+        "Ska", // 21
+        "Death Metal", // 22
+        "Pranks", // 23
+        "Soundtrack", // 24
+        "Euro-Techno", // 25
+        "Ambient", // 26
+        "Trip-Hop", // 27
+        "Vocal", // 28
+        "Jazz+Funk", // 29
+        "Fusion", // 30
+        "Trance", // 31
+        "Classical", // 32
+        "Instrumental", // 33
+        "Acid", // 34
+        "House", // 35
+        "Game", // 36
+        "Sound Clip", // 37
+        "Gospel", // 38
+        "Noise", // 39
+        "AlternRock", // 40
+        "Bass", // 41
+        "Soul", // 42
+        "Punk", // 43
+        "Space", // 44
+        "Meditative", // 45
+        "Instrumental Pop", // 46
+        "Instrumental Rock", // 47
+        "Ethnic", // 48
+        "Gothic", // 49
+        "Darkwave", // 50
+        "Techno-Industrial", // 51
+        "Electronic", // 52
+        "Pop-Folk", // 53
+        "Eurodance", // 54
+        "Dream", // 55
+        "Southern Rock", // 56
+        "Comedy", // 57
+        "Cult", // 58
+        "Gangsta", // 59
+        "Top 40", // 60
+        "Christian Rap", // 61
+        "Pop/Funk", // 62
+        "Jungle", // 63
+        "Native American", // 64
+        "Cabaret", // 65
+        "New Wave", // 66
+        "Psychadelic", // 67
+        "Rave", // 68
+        "Showtunes", // 69
+        "Trailer", // 70
+        "Lo-Fi", // 71
+        "Tribal", // 72
+        "Acid Punk", // 73
+        "Acid Jazz", // 74
+        "Polka", // 75
+        "Retro", // 76
+        "Musical", // 77
+        "Rock & Roll", // 78
+        "Hard Rock", // 79
+        "Folk", // 80
+        "Folk-Rock", // 81
+        "National Folk", // 82
+        "Swing", // 83
+        "Fast Fusion", // 84
+        "Bebob", // 85
+        "Latin", // 86
+        "Revival", // 87
+        "Celtic", // 88
+        "Bluegrass", // 89
+        "Avantgarde", // 90
+        "Gothic Rock", // 91
+        "Progressive Rock", // 92
+        "Psychedelic Rock", // 93
+        "Symphonic Rock", // 94
+        "Slow Rock", // 95
+        "Big Band", // 96
+        "Chorus", // 97
+        "Easy Listening", // 98
+        "Acoustic", // 99
+        "Humour", // 100
+        "Speech", // 101
+        "Chanson", // 102
+        "Opera", // 103
+        "Chamber Music", // 104
+        "Sonata", // 105
+        "Symphony", // 106
+        "Booty Bass", // 107
+        "Primus", // 108
+        "Porn Groove", // 109
+        "Satire", // 110
+        "Slow Jam", // 111
+        "Club", // 112
+        "Tango", // 113
+        "Samba", // 114
+        "Folklore", // 115
+        "Ballad", // 116
+        "Power Ballad", // 117
+        "Rhythmic Soul", // 118
+        "Freestyle", // 119
+        "Duet", // 120
+        "Punk Rock", // 121
+        "Drum Solo", // 122
+        "A capella", // 123
+        "Euro-House", // 124
+        "Dance Hall", // 125
+        "" // sentinel
+    };
+
+    /**
+     * Does the file contain this kind of tags?
+     *
+     * @return <code>true</code> if tags of this kind were found
+     */
+    boolean getTagsPresent();
+
+    /** @return the track title, or <code>null</code> if not available */
+    String getTitle();
+
+    /** @return the artist, or <code>null</code> if not available */
+    String getArtist();
+
+    /** @return the album name, or <code>null</code> if not available */
+    String getAlbum();
+
+    /** @return the comment, or <code>null</code> if not available */
+    String getComment();
+
+    /** @return the genre name, or <code>null</code> if not available */
+    String getGenre();
+
+    /** @return the year, or <code>null</code> if not available */
+    String getYear();
+
+    /** @return the track number, or <code>null</code> if not available */
+    String getTrackNumber();
+
+}

Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java?rev=903176&view=auto
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java (added)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java Tue Jan 26 11:27:33 2010
@@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+
+import org.apache.tika.exception.TikaException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * This is used to parse ID3 Version 1 Tag information from an MP3 file, 
+ * if available.
+ *
+ * @see <a href="http://www.id3.org/ID3v1">MP3 ID3 Version 1 specification</a>
+ */
+public class ID3v1Handler implements ID3Tags {
+    private String title;
+    private String artist;
+    private String album;
+    private String year;
+    private String comment;
+    private String genre;
+    private String trackNumber;
+
+    /** Set to true once a 128-byte block starting with "TAG" was found. */
+    boolean found = false;
+
+    /**
+     * Reads the given stream to its end and, if the final 128 bytes start
+     * with the marker <code>TAG</code>, extracts the ID3v1 fields from them.
+     * When no such block is found all fields stay <code>null</code> and
+     * {@link #getTagsPresent()} returns <code>false</code>.
+     *
+     * @param stream the MP3 data; it is consumed completely
+     * @param handler not used by this constructor
+     * @throws IOException if the stream could not be read
+     * @throws SAXException declared for callers; not thrown by the code here
+     * @throws TikaException if the ISO-8859-1 encoding is not available
+     */
+    public ID3v1Handler(InputStream stream, ContentHandler handler)
+            throws IOException, SAXException, TikaException {
+        byte[] tag = getSuffix(stream, 128);
+        if (tag.length == 128
+                && tag[0] == 'T' && tag[1] == 'A' && tag[2] == 'G') {
+            found = true;
+
+            // Fixed-width fields; the end offsets are exclusive
+            title = getString(tag, 3, 33);
+            artist = getString(tag, 33, 63);
+            album = getString(tag, 63, 93);
+            year = getString(tag, 93, 97);
+            comment = getString(tag, 97, 127);
+
+            // IDs beyond the predefined table are clamped onto the
+            // empty-string sentinel stored as the last GENRES entry
+            int genreID = (int) tag[127] & 0xff; // unsigned byte
+            genre = GENRES[Math.min(genreID, GENRES.length - 1)];
+
+            // ID3v1.1 Track addition
+            // If the last two bytes of the comment field are zero and
+            // non-zero, then the last byte is the track number
+            if (tag[125] == 0 && tag[126] != 0) {
+                int trackNum = (int) tag[126] & 0xff;
+                trackNumber = Integer.toString(trackNum);
+            }
+        }
+    }
+
+
+    /**
+     * @return <code>true</code> if an ID3v1 block was found at the end
+     *         of the stream
+     */
+    public boolean getTagsPresent() {
+        return found;
+    }
+
+    public String getTitle() {
+        return title;
+    }
+
+    public String getArtist() {
+        return artist;
+    }
+
+    public String getAlbum() {
+        return album;
+    }
+
+    public String getYear() {
+        return year;
+    }
+
+    public String getComment() {
+        return comment;
+    }
+
+    public String getGenre() {
+        return genre;
+    }
+
+    public String getTrackNumber() {
+        return trackNumber;
+    }
+
+
+    /**
+     * Returns the identified ISO-8859-1 substring from the given byte buffer.
+     * The return value is the zero-terminated substring retrieved from
+     * between the given start and end positions in the given byte buffer.
+     * Extra whitespace (and control characters, i.e. byte values up to
+     * 0x20) from the beginning and the end of the substring is removed.
+     *
+     * @param buffer byte buffer
+     * @param start start index of the substring (inclusive)
+     * @param end end index of the substring (exclusive)
+     * @return the identified substring, possibly empty
+     * @throws TikaException if the ISO-8859-1 encoding is not available
+     */
+    private static String getString(byte[] buffer, int start, int end)
+            throws TikaException {
+        // Find the zero byte that marks the end of the string
+        int zero = start;
+        while (zero < end && buffer[zero] != 0) {
+            zero++;
+        }
+
+        // Skip trailing whitespace. Java bytes are signed, so the value is
+        // masked to 0..255 first; otherwise every ISO-8859-1 character
+        // >= 0x80 (accented letters etc.) is negative and would be
+        // stripped as if it were whitespace.
+        end = zero;
+        while (start < end && (buffer[end - 1] & 0xff) <= ' ') {
+            end--;
+        }
+
+        // Skip leading whitespace (same unsigned comparison)
+        while (start < end && (buffer[start] & 0xff) <= ' ') {
+            start++;
+        }
+
+        // Return the remaining substring
+        try {
+            return new String(buffer, start, end - start, "ISO-8859-1");
+        } catch (UnsupportedEncodingException e) {
+            throw new TikaException("ISO-8859-1 encoding is not available", e);
+        }
+    }
+
+    /**
+     * Reads and returns the last <code>length</code> bytes from the
+     * given stream. The stream is read until its end.
+     *
+     * @param stream the <code>InputStream</code> to read from
+     * @param length number of bytes from the end to read and return
+     * @return the last <code>length</code> bytes of the stream, or all of
+     *         its bytes if the stream is shorter than that
+     * @throws IOException if the stream could not be read from.
+     */
+    private static byte[] getSuffix(InputStream stream, int length)
+            throws IOException {
+        // Double-size window: whenever it fills up, the newest
+        // <code>length</code> bytes are moved to the front and reading
+        // continues behind them.
+        byte[] buffer = new byte[2 * length];
+        int bytesInBuffer = 0;
+
+        int n = stream.read(buffer);
+        while (n != -1) {
+            bytesInBuffer += n;
+            if (bytesInBuffer == buffer.length) {
+                System.arraycopy(buffer, bytesInBuffer - length, buffer, 0, length);
+                bytesInBuffer = length;
+            }
+            n = stream.read(buffer, bytesInBuffer, buffer.length - bytesInBuffer);
+        }
+
+        // Stream shorter than requested: return everything that was read
+        if (bytesInBuffer < length) {
+            length = bytesInBuffer;
+        }
+
+        byte[] result = new byte[length];
+        System.arraycopy(buffer, bytesInBuffer - length, result, 0, length);
+        return result;
+    }
+
+}

Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java?rev=903176&view=auto
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java (added)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java Tue Jan 26 11:27:33 2010
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTag;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTagIterator;
+import org.xml.sax.SAXException;
+
+/**
+ * This is used to parse ID3 Version 2.2 Tag information from an MP3 file,
+ * if available.
+ *
+ * @see <a href="http://id3lib.sourceforge.net/id3/id3v2-00.txt">MP3 ID3 Version 2.2 specification</a>
+ */
+public class ID3v22Handler implements ID3Tags {
+    private String title;
+    private String artist;
+    private String album;
+    private String year;
+    private String comment;
+    private String genre;
+    private String trackNumber;
+
+    public ID3v22Handler(ID3v2Frame frame)
+            throws IOException, SAXException, TikaException {
+        RawTagIterator tags = new RawV22TagIterator(frame);
+        while (tags.hasNext()) {
+            RawTag tag = tags.next();
+            if (tag.name.equals("TT2")) {
+                title = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TP1")) {
+                artist = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TAL")) {
+                album = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TYE")) {
+                year = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("COM")) {
+                comment = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TRK")) {
+                trackNumber = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TCO")) {
+                String rawGenre = getTagString(tag.data, 0, tag.data.length);
+                int open = rawGenre.indexOf("(");
+                int close = rawGenre.indexOf(")");
+                if (open < close) {
+                    try {
+                        int genreID = Integer.parseInt(rawGenre.substring(open+1, close));
+                        genre = ID3Tags.GENRES[genreID];
+                    } catch(NumberFormatException ignore) {
+                    }
+                }
+            }
+        }
+    }
+
+    private String getTagString(byte[] data, int offset, int length) {
+        return ID3v2Frame.getTagString(data, offset, length);
+    }
+
+    public boolean getTagsPresent() {
+        return true;
+    }
+
+    public String getTitle() {
+        return title;
+    }
+
+    public String getArtist() {
+        return artist;
+    }
+
+    public String getAlbum() {
+        return album;
+    }
+
+    public String getYear() {
+        return year;
+    }
+
+    public String getComment() {
+        return comment;
+    }
+
+    public String getGenre() {
+        return genre;
+    }
+
+    public String getTrackNumber() {
+        return trackNumber;
+    }
+
+    private class RawV22TagIterator extends RawTagIterator {
+        private RawV22TagIterator(ID3v2Frame frame) {
+            frame.super(3, 3, 1, 0);
+        }
+    }
+
+}

Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java?rev=903176&view=auto
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java (added)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java Tue Jan 26 11:27:33 2010
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTag;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTagIterator;
+import org.xml.sax.SAXException;
+
+/**
+ * This is used to parse ID3 Version 2.3 Tag information from an MP3 file,
+ * if available.
+ *
+ * @see <a href="http://id3lib.sourceforge.net/id3/id3v2.3.0.html">MP3 ID3 Version 2.3 specification</a>
+ */
+public class ID3v23Handler implements ID3Tags {
+    private String title;
+    private String artist;
+    private String album;
+    private String year;
+    private String comment;
+    private String genre;
+    private String trackNumber;
+
+    /**
+     * Walks the raw tags of the given frame and stores the ones that map
+     * onto the common {@link ID3Tags} fields (TIT2, TPE1, TALB, TYER,
+     * COMM, TRCK and TCON). Tags with any other name are ignored.
+     *
+     * @param frame the ID3v2 frame to take the tags from
+     * @throws IOException declared for callers; not thrown by the code here
+     * @throws SAXException declared for callers; not thrown by the code here
+     * @throws TikaException declared for callers; not thrown by the code here
+     */
+    public ID3v23Handler(ID3v2Frame frame)
+            throws IOException, SAXException, TikaException {
+        RawTagIterator tags = new RawV23TagIterator(frame);
+        while (tags.hasNext()) {
+            RawTag tag = tags.next();
+            if (tag.name.equals("TIT2")) {
+                title = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TPE1")) {
+                artist = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TALB")) {
+                album = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TYER")) {
+                year = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("COMM")) {
+                comment = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TRCK")) {
+                trackNumber = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TCON")) {
+                // Only the "(NN)" form referencing a predefined genre is
+                // resolved; any other content leaves the genre unset
+                String rawGenre = getTagString(tag.data, 0, tag.data.length);
+                int open = rawGenre.indexOf("(");
+                int close = rawGenre.indexOf(")");
+                if (open < close) {
+                    try {
+                        int genreID = Integer.parseInt(rawGenre.substring(open+1, close));
+                        // Guard the lookup: an ID outside the table (e.g.
+                        // "(255)" or "(-1)") would otherwise abort parsing
+                        // with an ArrayIndexOutOfBoundsException
+                        if (genreID >= 0 && genreID < ID3Tags.GENRES.length) {
+                            genre = ID3Tags.GENRES[genreID];
+                        }
+                    } catch(NumberFormatException ignored) {
+                        // Not a numeric genre reference; leave genre unset
+                    }
+                }
+            }
+        }
+    }
+
+    /** Delegates to {@link ID3v2Frame#getTagString(byte[], int, int)}. */
+    private String getTagString(byte[] data, int offset, int length) {
+        return ID3v2Frame.getTagString(data, offset, length);
+    }
+
+    /**
+     * @return always <code>true</code>
+     */
+    public boolean getTagsPresent() {
+        return true;
+    }
+
+    public String getTitle() {
+        return title;
+    }
+
+    public String getArtist() {
+        return artist;
+    }
+
+    public String getAlbum() {
+        return album;
+    }
+
+    public String getYear() {
+        return year;
+    }
+
+    public String getComment() {
+        return comment;
+    }
+
+    public String getGenre() {
+        return genre;
+    }
+
+    public String getTrackNumber() {
+        return trackNumber;
+    }
+
+    /**
+     * Raw tag iterator configured for ID3v2.3 frames.
+     * NOTE(review): the meaning of the four constructor arguments is
+     * defined by ID3v2Frame.RawTagIterator, which is not visible here.
+     */
+    private class RawV23TagIterator extends RawTagIterator {
+        private RawV23TagIterator(ID3v2Frame frame) {
+            frame.super(4, 4, 1, 2);
+        }
+    }
+
+}

Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java?rev=903176&view=auto
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java (added)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java Tue Jan 26 11:27:33 2010
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTag;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTagIterator;
+import org.xml.sax.SAXException;
+
+/**
+ * This is used to parse ID3 Version 2.4 Tag information from an MP3 file,
+ * if available.
+ *
+ * @see <a href="http://www.id3.org/id3v2.4.0-structures">MP3 ID3 Version 2.4 specification</a>
+ * @see <a href="http://www.id3.org/id3v2.4.0-frames">MP3 ID3 Version 2.4 frames/tags</a>
+ */
+public class ID3v24Handler implements ID3Tags {
+    private String title;
+    private String artist;
+    private String album;
+    private String year;
+    private String comment;
+    private String genre;
+    private String trackNumber;
+
+    /**
+     * Walks the raw tags of the given frame and stores the ones that map
+     * onto the common {@link ID3Tags} fields (TIT2, TPE1, TALB, TYER,
+     * COMM, TRCK and TCON). Tags with any other name are ignored.
+     *
+     * @param frame the ID3v2 frame to take the tags from
+     * @throws IOException declared for callers; not thrown by the code here
+     * @throws SAXException declared for callers; not thrown by the code here
+     * @throws TikaException declared for callers; not thrown by the code here
+     */
+    public ID3v24Handler(ID3v2Frame frame)
+            throws IOException, SAXException, TikaException {
+        RawTagIterator tags = new RawV24TagIterator(frame);
+        while (tags.hasNext()) {
+            RawTag tag = tags.next();
+            if (tag.name.equals("TIT2")) {
+                title = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TPE1")) {
+                artist = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TALB")) {
+                album = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TYER")) {
+                // NOTE(review): ID3v2.4 replaces TYER with TDRC, so files
+                // tagged strictly per v2.4 may yield no year here - confirm
+                year = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("COMM")) {
+                comment = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TRCK")) {
+                trackNumber = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TCON")) {
+                // Only the "(NN)" form referencing a predefined genre is
+                // resolved; any other content leaves the genre unset
+                String rawGenre = getTagString(tag.data, 0, tag.data.length);
+                int open = rawGenre.indexOf("(");
+                int close = rawGenre.indexOf(")");
+                if (open < close) {
+                    try {
+                        int genreID = Integer.parseInt(rawGenre.substring(open+1, close));
+                        // Guard the lookup: an ID outside the table (e.g.
+                        // "(255)" or "(-1)") would otherwise abort parsing
+                        // with an ArrayIndexOutOfBoundsException
+                        if (genreID >= 0 && genreID < ID3Tags.GENRES.length) {
+                            genre = ID3Tags.GENRES[genreID];
+                        }
+                    } catch(NumberFormatException ignored) {
+                        // Not a numeric genre reference; leave genre unset
+                    }
+                }
+            }
+        }
+    }
+
+    /** Delegates to {@link ID3v2Frame#getTagString(byte[], int, int)}. */
+    private String getTagString(byte[] data, int offset, int length) {
+        return ID3v2Frame.getTagString(data, offset, length);
+    }
+
+    /**
+     * @return always <code>true</code>
+     */
+    public boolean getTagsPresent() {
+        return true;
+    }
+
+    public String getTitle() {
+        return title;
+    }
+
+    public String getArtist() {
+        return artist;
+    }
+
+    public String getAlbum() {
+        return album;
+    }
+
+    public String getYear() {
+        return year;
+    }
+
+    public String getComment() {
+        return comment;
+    }
+
+    public String getGenre() {
+        return genre;
+    }
+
+    public String getTrackNumber() {
+        return trackNumber;
+    }
+
+    /**
+     * Raw tag iterator configured for ID3v2.4 frames.
+     * NOTE(review): the meaning of the four constructor arguments is
+     * defined by ID3v2Frame.RawTagIterator, which is not visible here.
+     */
+    private class RawV24TagIterator extends RawTagIterator {
+        private RawV24TagIterator(ID3v2Frame frame) {
+            frame.super(4, 4, 4, 2);
+        }
+    }
+
+}

Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java?rev=903176&view=auto
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java (added)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java Tue Jan 26 11:27:33 2010
@@ -0,0 +1,283 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.util.Iterator;
+
+/**
+ * A frame of ID3v2 data, which is then passed to a handler to 
+ * be turned into useful data.
+ */
+public class ID3v2Frame {
+    private int majorVersion;
+    private int minorVersion;
+    private int flags;
+    private int length;
+    /** Excludes the header size part */
+    private byte[] extendedHeader;
+    private byte[] data;
+
+    public int getMajorVersion() {
+        return majorVersion;
+    }
+
+    public int getMinorVersion() {
+        return minorVersion;
+    }
+
+    public int getFlags() {
+        return flags;
+    }
+
+    public int getLength() {
+        return length;
+    }
+
+    public byte[] getExtendedHeader() {
+        return extendedHeader;
+    }
+
+    public byte[] getData() {
+        return data;
+    }
+
+    /**
+     * Returns a frame of ID3v2 data, or null if the
+     *  next data to be read from the InputStream 
+     *  doesn't correspond to an ID3v2 Frame
+     */
+    public static ID3v2Frame createFrameIfPresent(InputStream inp)
+            throws IOException {
+        int h1 = inp.read();
+        int h2 = inp.read();
+        int h3 = inp.read();
+        if (h1 == (int)'I' && h2 == (int)'D' && h3 == (int)'3') {
+            int majorVersion = inp.read();
+            int minorVersion = inp.read();
+            if (majorVersion == -1 || minorVersion == -1) {
+                return null;
+            }
+            return new ID3v2Frame(majorVersion, minorVersion, inp);
+        }
+
+        // Not a frame header
+        return null;
+    }
+
+    private ID3v2Frame(int majorVersion, int minorVersion, InputStream inp)
+            throws IOException {
+        this.majorVersion = majorVersion;
+        this.minorVersion = minorVersion;
+
+        // Get the flags and the length
+        flags = inp.read();
+        length = 4 * getInt(readFully(inp, 4));
+
+        // Do we have an extended header?
+        if ((flags & 0x02) == 0x02) {
+            int size = getInt(readFully(inp, 4));
+            extendedHeader = readFully(inp, size);
+        }
+
+        // Get the frame's data
+        data = readFully(inp, length);
+    }
+
+    protected static int getInt(byte[] data) {
+        return getInt(data, 0);
+    }
+
+    protected static int getInt(byte[] data, int offset) {
+        int b0 = data[offset+0] & 0xFF;
+        int b1 = data[offset+1] & 0xFF;
+        int b2 = data[offset+2] & 0xFF;
+        int b3 = data[offset+3] & 0xFF;
+        return (b0 << 24) + (b1 << 16) + (b2 << 8) + (b3 << 0);
+    }
+
+    protected static int getInt3(byte[] data, int offset) {
+        int b0 = data[offset+0] & 0xFF;
+        int b1 = data[offset+1] & 0xFF;
+        int b2 = data[offset+2] & 0xFF;
+        return (b0 << 16) + (b1 << 8) + (b2 << 0);
+    }
+
+    protected static int getInt2(byte[] data, int offset) {
+        int b0 = data[offset+0] & 0xFF;
+        int b1 = data[offset+1] & 0xFF;
+        return (b0 << 8) + (b1 << 0);
+    }
+
+    protected static byte[] readFully(InputStream inp, int length)
+            throws IOException {
+        byte[] b = new byte[length];
+
+        int pos = 0;
+        int read;
+        while (pos < length) {
+            read = inp.read(b, pos, length-pos);
+            if (read == -1) {
+                throw new IOException("Tried to read " + length + " bytes, but only " + pos + " bytes present"); 
+            }
+            pos += read;
+        }
+
+        return b;
+    }
+
+    /**
+     * Returns the (possibly null padded) String at the given offset and
+     * length. String encoding is held in the first byte; 
+     */
+    protected static String getTagString(byte[] data, int offset, int length) {
+        int actualLength = length;
+        while (data[actualLength-1] == 0) {
+            actualLength--;
+        }
+
+        // Does it have an encoding flag?
+        // Detect by the first byte being sub 0x20
+        String encoding = "ISO-8859-1";
+        byte maybeEncodingFlag = data[offset];
+        if (maybeEncodingFlag == 0 || maybeEncodingFlag == 1) {
+            offset++;
+            actualLength--;
+            if (maybeEncodingFlag == 1) {
+                // With BOM
+                encoding = "UTF-16";
+            } else if (maybeEncodingFlag == 2) {
+                // Without BOM
+                encoding = "UTF-16BE";
+            } else if (maybeEncodingFlag == 3) {
+                encoding = "UTF8";
+            }
+        }
+
+        try {
+            return new String(data, offset, actualLength, encoding);
+        } catch (UnsupportedEncodingException e) {
+            throw new RuntimeException(
+                    "Core encoding " + encoding + " is not available", e);
+        }
+    }
+
+    /**
+     * Returns the String at the given
+     *  offset and length. Strings are ISO-8859-1 
+     */
+    protected static String getString(byte[] data, int offset, int length) {
+        try {
+            return new String(data, offset, length, "ISO-8859-1");
+        } catch (UnsupportedEncodingException e) {
+            throw new RuntimeException(
+                    "Core encoding ISO-8859-1 encoding is not available", e);
+        }
+    }
+
+
+    /**
+     * Iterates over id3v2 raw tags.
+     * Create an instance of this that configures the
+     *  various length and multipliers.
+     */
+    protected class RawTagIterator implements Iterator<RawTag> {
+        private int nameLength;
+        private int sizeLength;
+        private int sizeMultiplier;
+        private int flagLength;
+
+        private int offset = 0;
+
+        protected RawTagIterator(
+                int nameLength, int sizeLength, int sizeMultiplier,
+                int flagLength) {
+            this.nameLength = nameLength;
+            this.sizeLength = sizeLength;
+            this.sizeMultiplier = sizeMultiplier;
+            this.flagLength = flagLength;
+        }
+
+        public boolean hasNext() {
+            if (offset < data.length) {
+                // Check for padding at the end
+                if (data[offset] != 0) {
+                    return true;
+                }
+            }
+            return false;
+        }
+
+        public RawTag next() {
+            RawTag tag = new RawTag(nameLength, sizeLength, sizeMultiplier,
+                    flagLength, data, offset);
+            offset += tag.getSize();
+            return tag;
+        }
+
+        public void remove() {
+        }
+
+    }
+
+    protected static class RawTag {
+        private int headerSize;
+        protected String name;
+        protected int flag;
+        protected byte[] data;
+
+        private RawTag(
+                int nameLength, int sizeLength, int sizeMultiplier,
+                int flagLength, byte[] frameData, int offset) {
+            headerSize = nameLength + sizeLength + flagLength;
+
+            // Name, normally 3 or 4 bytes
+            name = getString(frameData, offset, nameLength);
+
+            // Size
+            int rawSize;
+            if (sizeLength == 3) {
+                rawSize = getInt3(frameData, offset+nameLength);
+            } else {
+                rawSize = getInt(frameData, offset+nameLength);
+            }
+            int size = rawSize * sizeMultiplier;
+
+            // Flag
+            if (flagLength > 0) {
+                if (flagLength == 1) {
+                    flag = (int)frameData[offset+nameLength+sizeLength];
+                } else {
+                    flag = getInt2(frameData, offset+nameLength+sizeLength);
+                }
+            }
+
+            // Now data
+            data = new byte[size];
+            System.arraycopy(frameData, 
+                    offset+nameLength+sizeLength+flagLength, data, 0, size);
+        }
+
+        protected int getSize() {
+            return headerSize + data.length;
+        }
+
+    }
+
+}

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java?rev=903176&r1=903175&r2=903176&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java Tue Jan 26 11:27:33 2010
@@ -18,7 +18,8 @@
 
 import java.io.IOException;
 import java.io.InputStream;
-import java.io.UnsupportedEncodingException;
+import java.util.ArrayList;
+import java.util.List;
 
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
@@ -33,144 +34,11 @@
  * from an MP3 file, if available.
  *
  * @see <a href="http://www.id3.org/ID3v1">MP3 ID3 Version 1 specification</a>
+ * @see <a href="http://www.id3.org/id3v2.4.0-structure">MP3 ID3 Version 2.4 Structure Specification</a>
+ * @see <a href="http://www.id3.org/id3v2.4.0-frames">MP3 ID3 Version 2.4 Frames Specification</a>
  */
 public class Mp3Parser implements Parser {
 
-    /**
-     * List of predefined genres.
-     *
-     * @see http://www.id3.org/id3v2-00
-     */
-    private static final String[] GENRES = new String[] {
-        /*  0 */ "Blues",
-        /*  1 */ "Classic Rock",
-        /*  2 */ "Country",
-        /*  3 */ "Dance",
-        /*  4 */ "Disco",
-        /*  5 */ "Funk",
-        /*  6 */ "Grunge",
-        /*  7 */ "Hip-Hop",
-        /*  8 */ "Jazz",
-        /*  9 */ "Metal",
-        /* 10 */ "New Age",
-        /* 11 */ "Oldies",
-        /* 12 */ "Other",
-        /* 13 */ "Pop",
-        /* 14 */ "R&B",
-        /* 15 */ "Rap",
-        /* 16 */ "Reggae",
-        /* 17 */ "Rock",
-        /* 18 */ "Techno",
-        /* 19 */ "Industrial",
-        /* 20 */ "Alternative",
-        /* 21 */ "Ska",
-        /* 22 */ "Death Metal",
-        /* 23 */ "Pranks",
-        /* 24 */ "Soundtrack",
-        /* 25 */ "Euro-Techno",
-        /* 26 */ "Ambient",
-        /* 27 */ "Trip-Hop",
-        /* 28 */ "Vocal",
-        /* 29 */ "Jazz+Funk",
-        /* 30 */ "Fusion",
-        /* 31 */ "Trance",
-        /* 32 */ "Classical",
-        /* 33 */ "Instrumental",
-        /* 34 */ "Acid",
-        /* 35 */ "House",
-        /* 36 */ "Game",
-        /* 37 */ "Sound Clip",
-        /* 38 */ "Gospel",
-        /* 39 */ "Noise",
-        /* 40 */ "AlternRock",
-        /* 41 */ "Bass",
-        /* 42 */ "Soul",
-        /* 43 */ "Punk",
-        /* 44 */ "Space",
-        /* 45 */ "Meditative",
-        /* 46 */ "Instrumental Pop",
-        /* 47 */ "Instrumental Rock",
-        /* 48 */ "Ethnic",
-        /* 49 */ "Gothic",
-        /* 50 */ "Darkwave",
-        /* 51 */ "Techno-Industrial",
-        /* 52 */ "Electronic",
-        /* 53 */ "Pop-Folk",
-        /* 54 */ "Eurodance",
-        /* 55 */ "Dream",
-        /* 56 */ "Southern Rock",
-        /* 57 */ "Comedy",
-        /* 58 */ "Cult",
-        /* 59 */ "Gangsta",
-        /* 60 */ "Top 40",
-        /* 61 */ "Christian Rap",
-        /* 62 */ "Pop/Funk",
-        /* 63 */ "Jungle",
-        /* 64 */ "Native American",
-        /* 65 */ "Cabaret",
-        /* 66 */ "New Wave",
-        /* 67 */ "Psychadelic",
-        /* 68 */ "Rave",
-        /* 69 */ "Showtunes",
-        /* 70 */ "Trailer",
-        /* 71 */ "Lo-Fi",
-        /* 72 */ "Tribal",
-        /* 73 */ "Acid Punk",
-        /* 74 */ "Acid Jazz",
-        /* 75 */ "Polka",
-        /* 76 */ "Retro",
-        /* 77 */ "Musical",
-        /* 78 */ "Rock & Roll",
-        /* 79 */ "Hard Rock",
-        /* 80 */ "Folk",
-        /* 81 */ "Folk-Rock",
-        /* 82 */ "National Folk",
-        /* 83 */ "Swing",
-        /* 84 */ "Fast Fusion",
-        /* 85 */ "Bebob",
-        /* 86 */ "Latin",
-        /* 87 */ "Revival",
-        /* 88 */ "Celtic",
-        /* 89 */ "Bluegrass",
-        /* 90 */ "Avantgarde",
-        /* 91 */ "Gothic Rock",
-        /* 92 */ "Progressive Rock",
-        /* 93 */ "Psychedelic Rock",
-        /* 94 */ "Symphonic Rock",
-        /* 95 */ "Slow Rock",
-        /* 96 */ "Big Band",
-        /* 97 */ "Chorus",
-        /* 98 */ "Easy Listening",
-        /* 99 */ "Acoustic",
-        /* 100 */ "Humour",
-        /* 101 */ "Speech",
-        /* 102 */ "Chanson",
-        /* 103 */ "Opera",
-        /* 104 */ "Chamber Music",
-        /* 105 */ "Sonata",
-        /* 106 */ "Symphony",
-        /* 107 */ "Booty Bass",
-        /* 108 */ "Primus",
-        /* 109 */ "Porn Groove",
-        /* 110 */ "Satire",
-        /* 111 */ "Slow Jam",
-        /* 112 */ "Club",
-        /* 113 */ "Tango",
-        /* 114 */ "Samba",
-        /* 115 */ "Folklore",
-        /* 116 */ "Ballad",
-        /* 117 */ "Power Ballad",
-        /* 118 */ "Rhythmic Soul",
-        /* 119 */ "Freestyle",
-        /* 120 */ "Duet",
-        /* 121 */ "Punk Rock",
-        /* 122 */ "Drum Solo",
-        /* 123 */ "A capella",
-        /* 124 */ "Euro-House",
-        /* 125 */ "Dance Hall",
-        /* sentinel */ ""
-    };
-
     public void parse(
             InputStream stream, ContentHandler handler,
             Metadata metadata, ParseContext context)
@@ -179,34 +47,28 @@
 
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         xhtml.startDocument();
-
-        byte[] tag = getSuffix(stream, 128);
-        if (tag.length == 128
-                && tag[0] == 'T' && tag[1] == 'A' && tag[2] == 'G') {
-            String title = getString(tag, 3, 33);
-            String artist = getString(tag, 33, 63);
-            String album = getString(tag, 63, 93);
-            String year = getString(tag, 93, 97);
-            String comment = getString(tag, 97, 127);
-            int genre = (int) tag[127] & 0xff; // unsigned byte
-
-            metadata.set(Metadata.TITLE, title);
-            metadata.set(Metadata.AUTHOR, artist);
-
-            xhtml.element("h1", title);
-            xhtml.element("p", artist);
+        
+        // Create handlers for the various kinds of ID3 tags
+        ID3Tags[] tags = getAllTagHandlers(stream, handler);
+
+        if (tags.length > 0) {
+           CompositeTagHandler tag = new CompositeTagHandler(tags);
+           
+           metadata.set(Metadata.TITLE, tag.getTitle());
+           metadata.set(Metadata.AUTHOR, tag.getArtist());
+
+           xhtml.element("h1", tag.getTitle());
+           xhtml.element("p", tag.getArtist());
+            
             // ID3v1.1 Track addition
-            // If the last two bytes of the comment field are zero and
-            // non-zero, then the last byte is the track number
-            if (tag[125] == 0 && tag[126] != 0) {
-                int track = (int) tag[126] & 0xff;
-                xhtml.element("p", album + ", track " + track);
+            if (tag.getTrackNumber() != null) {
+                xhtml.element("p", tag.getAlbum() + ", track " + tag.getTrackNumber());
             } else {
-                xhtml.element("p", album);
+                xhtml.element("p", tag.getAlbum());
             }
-            xhtml.element("p", year);
-            xhtml.element("p", comment);
-            xhtml.element("p", GENRES[Math.min(genre, GENRES.length - 1)]);
+            xhtml.element("p", tag.getYear());
+            xhtml.element("p", tag.getComment());
+            xhtml.element("p", tag.getGenre());
         }
 
         xhtml.endDocument();
@@ -222,75 +84,51 @@
     }
 
     /**
-     * Returns the identified ISO-8859-1 substring from the given byte buffer.
-     * The return value is the zero-terminated substring retrieved from
-     * between the given start and end positions in the given byte buffer.
-     * Extra whitespace (and control characters) from the beginning and the
-     * end of the substring is removed.
-     *
-     * @param buffer byte buffer
-     * @param start start index of the substring
-     * @param end end index of the substring
-     * @return the identified substring
-     * @throws TikaException if the ISO-8859-1 encoding is not available
+     * Scans the MP3 frames for ID3 tags, and creates ID3Tag Handlers
+     *  for each supported set of tags. 
      */
-    private static String getString(byte[] buffer, int start, int end)
-            throws TikaException {
-        // Find the zero byte that marks the end of the string
-        int zero = start;
-        while (zero < end && buffer[zero] != 0) {
-            zero++;
-        }
-
-        // Skip trailing whitespace
-        end = zero;
-        while (start < end && buffer[end - 1] <= ' ') {
-            end--;
-        }
-
-        // Skip leading whitespace
-        while (start < end && buffer[start] <= ' ') {
-            start++;
-        }
+    protected ID3Tags[] getAllTagHandlers(InputStream stream, ContentHandler handler)
+           throws IOException, SAXException, TikaException {
+       ID3v24Handler v24 = null;
+       ID3v23Handler v23 = null;
+       ID3v22Handler v22 = null;
+       ID3v1Handler v1 = null;
+
+       // ID3v2 tags live at the start of the file
+       // You can apparently have several different ID3 tag blocks
+       // So, keep going until we don't find any more
+       ID3v2Frame f;
+       while ((f = ID3v2Frame.createFrameIfPresent(stream)) != null) {
+          if (f.getMajorVersion() == 4) {
+             v24 = new ID3v24Handler(f);
+          } else if(f.getMajorVersion() == 3) {
+             v23 = new ID3v23Handler(f);
+          } else if(f.getMajorVersion() == 2) {
+             v22 = new ID3v22Handler(f);
+          }
+       }
 
-        // Return the remaining substring
-        try {
-            return new String(buffer, start, end - start, "ISO-8859-1");
-        } catch (UnsupportedEncodingException e) {
-            throw new TikaException("ISO-8859-1 encoding is not available", e);
-        }
-    }
+       // ID3v1 tags live at the end of the file
+       // Just let the handler run until it's finished
+       v1 = new ID3v1Handler(stream, handler);
+
+       // Go in order of preference
+       // Currently, that's newest to oldest
+       List<ID3Tags> tags = new ArrayList<ID3Tags>();
 
-    /**
-     * Reads and returns the last <code>length</code> bytes from the
-     * given stream.
-     * @param stream input stream
-     * @param length number of bytes from the end to read and return
-     * @return stream the <code>InputStream</code> to read from.
-     * @throws IOException if the stream could not be read from.
-     */
-   private static byte[] getSuffix(InputStream stream, int length)
-           throws IOException {
-       byte[] buffer = new byte[2 * length];
-       int bytesInBuffer = 0;
-
-       int n = stream.read(buffer);
-       while (n != -1) {
-           bytesInBuffer += n;
-           if (bytesInBuffer == buffer.length) {
-               System.arraycopy(buffer, bytesInBuffer - length, buffer, 0, length);
-               bytesInBuffer = length;
-           }
-           n = stream.read(buffer, bytesInBuffer, buffer.length - bytesInBuffer);
+       if(v24 != null && v24.getTagsPresent()) {
+          tags.add(v24);
        }
-
-       if (bytesInBuffer < length) {
-           length = bytesInBuffer;
+       if(v23 != null && v23.getTagsPresent()) {
+          tags.add(v23);
        }
-
-       byte[] result = new byte[length];
-       System.arraycopy(buffer, bytesInBuffer - length, result, 0, length);
-       return result;
-   }
+       if(v22 != null && v22.getTagsPresent()) {
+          tags.add(v22);
+       }
+       if(v1 != null && v1.getTagsPresent()) {
+          tags.add(v1);
+       }
+       return tags.toArray(new ID3Tags[tags.size()]);
+    }
 
 }

Modified: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/TestParsers.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/TestParsers.java?rev=903176&r1=903175&r2=903176&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/TestParsers.java (original)
+++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/TestParsers.java Tue Jan 26 11:27:33 2010
@@ -161,7 +161,7 @@
     }
 
     public void testMP3Extraction() throws Exception {
-        File file = getResourceAsFile("/test-documents/testMP3.mp3");
+        File file = getResourceAsFile("/test-documents/testMP3id3v1.mp3");
         String s1 = ParseUtils.getStringContent(file, tc);
         String s2 = ParseUtils.getStringContent(file, tc, "audio/mpeg");
         assertEquals(s1, s2);

Modified: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java?rev=903176&r1=903175&r2=903176&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java (original)
+++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java Tue Jan 26 11:27:33 2010
@@ -16,6 +16,7 @@
  */
 package org.apache.tika.parser.mp3;
 
+import java.io.ByteArrayInputStream;
 import java.io.InputStream;
 
 import junit.framework.TestCase;
@@ -31,13 +32,16 @@
  */
 public class Mp3ParserTest extends TestCase {
 
-    public void testMp3Parsing() throws Exception {
+    /**
+     * Test that with only ID3v1 tags, we get some information out   
+     */
+    public void testMp3ParsingID3v1() throws Exception {
         Parser parser = new AutoDetectParser(); // Should auto-detect!
         ContentHandler handler = new BodyContentHandler();
         Metadata metadata = new Metadata();
 
         InputStream stream = Mp3ParserTest.class.getResourceAsStream(
-                "/test-documents/testMP3.mp3");
+                "/test-documents/testMP3id3v1.mp3");
         try {
             parser.parse(stream, handler, metadata);
         } finally {
@@ -57,4 +61,80 @@
         assertTrue(content.contains("Rock"));
     }
 
+    /**
+     * Test that with only ID3v2 tags, we get the full
+     *  set of information out.
+     */
+    public void testMp3ParsingID3v2() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        InputStream stream = Mp3ParserTest.class.getResourceAsStream(
+                "/test-documents/testMP3id3v2.mp3");
+        try {
+            parser.parse(stream, handler, metadata);
+        } finally {
+            stream.close();
+        }
+
+        assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Test Title", metadata.get(Metadata.TITLE));
+        assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
+
+        String content = handler.toString();
+        assertTrue(content.contains("Test Title"));
+        assertTrue(content.contains("Test Artist"));
+        assertTrue(content.contains("Test Album"));
+        assertTrue(content.contains("2008"));
+        assertTrue(content.contains("Test Comment"));
+        assertTrue(content.contains("Rock"));
+    }
+
+    /**
+     * Test that with both id3v2 and id3v1, we prefer the
+     *  details from id3v2
+     */
+    public void testMp3ParsingID3v1v2() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        InputStream stream = Mp3ParserTest.class.getResourceAsStream(
+                "/test-documents/testMP3id3v1_v2.mp3");
+        try {
+            parser.parse(stream, handler, metadata);
+        } finally {
+            stream.close();
+        }
+
+        assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Test Title", metadata.get(Metadata.TITLE));
+        assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
+
+        String content = handler.toString();
+        assertTrue(content.contains("Test Title"));
+        assertTrue(content.contains("Test Artist"));
+        assertTrue(content.contains("Test Album"));
+        assertTrue(content.contains("2008"));
+        assertTrue(content.contains("Test Comment"));
+        assertTrue(content.contains("Rock"));
+    }
+
+    public void testID3v2Frame() throws Exception {
+       byte[] empty = new byte[] {
+             0x49, 0x44, 0x33, 3, 1, 0,
+             0, 0, 0, 0
+       };
+       
+       assertEquals(11, ID3v2Frame.getInt(new byte[] {0,0,0,0x0b}));
+       assertEquals(257, ID3v2Frame.getInt(new byte[] {0,0,1,1}));
+       
+       ID3v2Frame f = ID3v2Frame.createFrameIfPresent(new ByteArrayInputStream(empty));
+       assertEquals(3, f.getMajorVersion());
+       assertEquals(1, f.getMinorVersion());
+       assertEquals(0, f.getFlags());
+       assertEquals(0, f.getLength());
+       assertEquals(0, f.getData().length);
+    }
 }

Added: lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3id3v1_v2.mp3
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3id3v1_v2.mp3?rev=903176&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3id3v1_v2.mp3
------------------------------------------------------------------------------
    svn:executable = *

Propchange: lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3id3v1_v2.mp3
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3id3v2.mp3
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3id3v2.mp3?rev=903176&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3id3v2.mp3
------------------------------------------------------------------------------
    svn:executable = *

Propchange: lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3id3v2.mp3
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream