You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/01/26 12:27:34 UTC
svn commit: r903176 - in /lucene/tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/mp3/ test/java/org/apache/tika/
test/java/org/apache/tika/parser/mp3/ test/resources/test-documents/
Author: jukka
Date: Tue Jan 26 11:27:33 2010
New Revision: 903176
URL: http://svn.apache.org/viewvc?rev=903176&view=rev
Log:
TIKA-368: ID3v2 support for mp3 parser
Patch by Nick Burch
Added:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3id3v1.mp3
- copied unchanged from r903148, lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3.mp3
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3id3v1_v2.mp3 (with props)
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3id3v2.mp3 (with props)
Removed:
lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3.mp3
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/TestParsers.java
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java
Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java?rev=903176&view=auto
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java (added)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java Tue Jan 26 11:27:33 2010
@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+/**
+ * Takes an array of {@link ID3Tags} in preference order, and when asked for
+ * a given tag, will return it from the first {@link ID3Tags} that has it.
+ */
+public class CompositeTagHandler implements ID3Tags {
+
+ private ID3Tags[] tags;
+
+ public CompositeTagHandler(ID3Tags[] tags) {
+ this.tags = tags;
+ }
+
+ public boolean getTagsPresent() {
+ for (ID3Tags tag : tags) {
+ if (tag.getTagsPresent()) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ public String getTitle() {
+ for (ID3Tags tag : tags) {
+ if (tag.getTitle() != null) {
+ return tag.getTitle();
+ }
+ }
+ return null;
+ }
+
+ public String getArtist() {
+ for (ID3Tags tag : tags) {
+ if (tag.getArtist() != null) {
+ return tag.getArtist();
+ }
+ }
+ return null;
+ }
+
+ public String getAlbum() {
+ for (ID3Tags tag : tags) {
+ if (tag.getAlbum() != null) {
+ return tag.getAlbum();
+ }
+ }
+ return null;
+ }
+
+ public String getYear() {
+ for (ID3Tags tag : tags) {
+ if (tag.getYear() != null) {
+ return tag.getYear();
+ }
+ }
+ return null;
+ }
+
+ public String getComment() {
+ for (ID3Tags tag : tags) {
+ if (tag.getComment() != null) {
+ return tag.getComment();
+ }
+ }
+ return null;
+ }
+
+ public String getGenre() {
+ for (ID3Tags tag : tags) {
+ if (tag.getGenre() != null) {
+ return tag.getGenre();
+ }
+ }
+ return null;
+ }
+
+ public String getTrackNumber() {
+ for (ID3Tags tag : tags) {
+ if (tag.getTrackNumber() != null) {
+ return tag.getTrackNumber();
+ }
+ }
+ return null;
+ }
+
+}
Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java?rev=903176&view=auto
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java (added)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java Tue Jan 26 11:27:33 2010
@@ -0,0 +1,185 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+
+/**
+ * Interface that defines the common interface for ID3 tag parsers,
+ * such as ID3v1 and ID3v2.3.
+ * Implementations should return NULL if the file lacks a given
+ * tag, or if the tag isn't defined for the version.
+ *
+ * Note that so far, only the ID3v1 core tags are listed here. In
+ * future, we may wish to add more to cover the extra tags that
+ * our ID3v2 handlers can produce.
+ */
+public interface ID3Tags {
+ /**
+ * List of predefined genres.
+ *
+ * @see http://www.id3.org/id3v2-00
+ */
+ String[] GENRES = new String[] {
+ /* 0 */ "Blues",
+ /* 1 */ "Classic Rock",
+ /* 2 */ "Country",
+ /* 3 */ "Dance",
+ /* 4 */ "Disco",
+ /* 5 */ "Funk",
+ /* 6 */ "Grunge",
+ /* 7 */ "Hip-Hop",
+ /* 8 */ "Jazz",
+ /* 9 */ "Metal",
+ /* 10 */ "New Age",
+ /* 11 */ "Oldies",
+ /* 12 */ "Other",
+ /* 13 */ "Pop",
+ /* 14 */ "R&B",
+ /* 15 */ "Rap",
+ /* 16 */ "Reggae",
+ /* 17 */ "Rock",
+ /* 18 */ "Techno",
+ /* 19 */ "Industrial",
+ /* 20 */ "Alternative",
+ /* 21 */ "Ska",
+ /* 22 */ "Death Metal",
+ /* 23 */ "Pranks",
+ /* 24 */ "Soundtrack",
+ /* 25 */ "Euro-Techno",
+ /* 26 */ "Ambient",
+ /* 27 */ "Trip-Hop",
+ /* 28 */ "Vocal",
+ /* 29 */ "Jazz+Funk",
+ /* 30 */ "Fusion",
+ /* 31 */ "Trance",
+ /* 32 */ "Classical",
+ /* 33 */ "Instrumental",
+ /* 34 */ "Acid",
+ /* 35 */ "House",
+ /* 36 */ "Game",
+ /* 37 */ "Sound Clip",
+ /* 38 */ "Gospel",
+ /* 39 */ "Noise",
+ /* 40 */ "AlternRock",
+ /* 41 */ "Bass",
+ /* 42 */ "Soul",
+ /* 43 */ "Punk",
+ /* 44 */ "Space",
+ /* 45 */ "Meditative",
+ /* 46 */ "Instrumental Pop",
+ /* 47 */ "Instrumental Rock",
+ /* 48 */ "Ethnic",
+ /* 49 */ "Gothic",
+ /* 50 */ "Darkwave",
+ /* 51 */ "Techno-Industrial",
+ /* 52 */ "Electronic",
+ /* 53 */ "Pop-Folk",
+ /* 54 */ "Eurodance",
+ /* 55 */ "Dream",
+ /* 56 */ "Southern Rock",
+ /* 57 */ "Comedy",
+ /* 58 */ "Cult",
+ /* 59 */ "Gangsta",
+ /* 60 */ "Top 40",
+ /* 61 */ "Christian Rap",
+ /* 62 */ "Pop/Funk",
+ /* 63 */ "Jungle",
+ /* 64 */ "Native American",
+ /* 65 */ "Cabaret",
+ /* 66 */ "New Wave",
+ /* 67 */ "Psychadelic",
+ /* 68 */ "Rave",
+ /* 69 */ "Showtunes",
+ /* 70 */ "Trailer",
+ /* 71 */ "Lo-Fi",
+ /* 72 */ "Tribal",
+ /* 73 */ "Acid Punk",
+ /* 74 */ "Acid Jazz",
+ /* 75 */ "Polka",
+ /* 76 */ "Retro",
+ /* 77 */ "Musical",
+ /* 78 */ "Rock & Roll",
+ /* 79 */ "Hard Rock",
+ /* 80 */ "Folk",
+ /* 81 */ "Folk-Rock",
+ /* 82 */ "National Folk",
+ /* 83 */ "Swing",
+ /* 84 */ "Fast Fusion",
+ /* 85 */ "Bebob",
+ /* 86 */ "Latin",
+ /* 87 */ "Revival",
+ /* 88 */ "Celtic",
+ /* 89 */ "Bluegrass",
+ /* 90 */ "Avantgarde",
+ /* 91 */ "Gothic Rock",
+ /* 92 */ "Progressive Rock",
+ /* 93 */ "Psychedelic Rock",
+ /* 94 */ "Symphonic Rock",
+ /* 95 */ "Slow Rock",
+ /* 96 */ "Big Band",
+ /* 97 */ "Chorus",
+ /* 98 */ "Easy Listening",
+ /* 99 */ "Acoustic",
+ /* 100 */ "Humour",
+ /* 101 */ "Speech",
+ /* 102 */ "Chanson",
+ /* 103 */ "Opera",
+ /* 104 */ "Chamber Music",
+ /* 105 */ "Sonata",
+ /* 106 */ "Symphony",
+ /* 107 */ "Booty Bass",
+ /* 108 */ "Primus",
+ /* 109 */ "Porn Groove",
+ /* 110 */ "Satire",
+ /* 111 */ "Slow Jam",
+ /* 112 */ "Club",
+ /* 113 */ "Tango",
+ /* 114 */ "Samba",
+ /* 115 */ "Folklore",
+ /* 116 */ "Ballad",
+ /* 117 */ "Power Ballad",
+ /* 118 */ "Rhythmic Soul",
+ /* 119 */ "Freestyle",
+ /* 120 */ "Duet",
+ /* 121 */ "Punk Rock",
+ /* 122 */ "Drum Solo",
+ /* 123 */ "A capella",
+ /* 124 */ "Euro-House",
+ /* 125 */ "Dance Hall",
+ /* sentinel */ ""
+ };
+
+ /**
+ * Does the file contain this kind of tags?
+ */
+ boolean getTagsPresent();
+
+ String getTitle();
+
+ String getArtist();
+
+ String getAlbum();
+
+ String getComment();
+
+ String getGenre();
+
+ String getYear();
+
+ String getTrackNumber();
+
+}
Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java?rev=903176&view=auto
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java (added)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java Tue Jan 26 11:27:33 2010
@@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+
+import org.apache.tika.exception.TikaException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * This is used to parse ID3 Version 1 Tag information from an MP3 file,
+ * if available.
+ *
+ * @see <a href="http://www.id3.org/ID3v1">MP3 ID3 Version 1 specification</a>
+ */
+public class ID3v1Handler implements ID3Tags {
+ private String title;
+ private String artist;
+ private String album;
+ private String year;
+ private String comment;
+ private String genre;
+ private String trackNumber;
+
+ boolean found = false;
+
+ public ID3v1Handler(InputStream stream, ContentHandler handler)
+ throws IOException, SAXException, TikaException {
+ byte[] tag = getSuffix(stream, 128);
+ if (tag.length == 128
+ && tag[0] == 'T' && tag[1] == 'A' && tag[2] == 'G') {
+ found = true;
+
+ title = getString(tag, 3, 33);
+ artist = getString(tag, 33, 63);
+ album = getString(tag, 63, 93);
+ year = getString(tag, 93, 97);
+ comment = getString(tag, 97, 127);
+
+ int genreID = (int) tag[127] & 0xff; // unsigned byte
+ genre = GENRES[Math.min(genreID, GENRES.length - 1)];
+
+ // ID3v1.1 Track addition
+ // If the last two bytes of the comment field are zero and
+ // non-zero, then the last byte is the track number
+ if (tag[125] == 0 && tag[126] != 0) {
+ int trackNum = (int) tag[126] & 0xff;
+ trackNumber = Integer.toString(trackNum);
+ }
+ }
+ }
+
+
+ public boolean getTagsPresent() {
+ return found;
+ }
+
+ public String getTitle() {
+ return title;
+ }
+
+ public String getArtist() {
+ return artist;
+ }
+
+ public String getAlbum() {
+ return album;
+ }
+
+ public String getYear() {
+ return year;
+ }
+
+ public String getComment() {
+ return comment;
+ }
+
+ public String getGenre() {
+ return genre;
+ }
+
+ public String getTrackNumber() {
+ return trackNumber;
+ }
+
+
+ /**
+ * Returns the identified ISO-8859-1 substring from the given byte buffer.
+ * The return value is the zero-terminated substring retrieved from
+ * between the given start and end positions in the given byte buffer.
+ * Extra whitespace (and control characters) from the beginning and the
+ * end of the substring is removed.
+ *
+ * @param buffer byte buffer
+ * @param start start index of the substring
+ * @param end end index of the substring
+ * @return the identified substring
+ * @throws TikaException if the ISO-8859-1 encoding is not available
+ */
+ private static String getString(byte[] buffer, int start, int end)
+ throws TikaException {
+ // Find the zero byte that marks the end of the string
+ int zero = start;
+ while (zero < end && buffer[zero] != 0) {
+ zero++;
+ }
+
+ // Skip trailing whitespace
+ end = zero;
+ while (start < end && buffer[end - 1] <= ' ') {
+ end--;
+ }
+
+ // Skip leading whitespace
+ while (start < end && buffer[start] <= ' ') {
+ start++;
+ }
+
+ // Return the remaining substring
+ try {
+ return new String(buffer, start, end - start, "ISO-8859-1");
+ } catch (UnsupportedEncodingException e) {
+ throw new TikaException("ISO-8859-1 encoding is not available", e);
+ }
+ }
+
+ /**
+ * Reads and returns the last <code>length</code> bytes from the
+ * given stream.
+ * @param stream input stream
+ * @param length number of bytes from the end to read and return
+ * @return stream the <code>InputStream</code> to read from.
+ * @throws IOException if the stream could not be read from.
+ */
+ private static byte[] getSuffix(InputStream stream, int length)
+ throws IOException {
+ byte[] buffer = new byte[2 * length];
+ int bytesInBuffer = 0;
+
+ int n = stream.read(buffer);
+ while (n != -1) {
+ bytesInBuffer += n;
+ if (bytesInBuffer == buffer.length) {
+ System.arraycopy(buffer, bytesInBuffer - length, buffer, 0, length);
+ bytesInBuffer = length;
+ }
+ n = stream.read(buffer, bytesInBuffer, buffer.length - bytesInBuffer);
+ }
+
+ if (bytesInBuffer < length) {
+ length = bytesInBuffer;
+ }
+
+ byte[] result = new byte[length];
+ System.arraycopy(buffer, bytesInBuffer - length, result, 0, length);
+ return result;
+ }
+
+}
Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java?rev=903176&view=auto
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java (added)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java Tue Jan 26 11:27:33 2010
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTag;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTagIterator;
+import org.xml.sax.SAXException;
+
+/**
+ * This is used to parse ID3 Version 2.2 Tag information from an MP3 file,
+ * if available.
+ *
+ * @see <a href="http://id3lib.sourceforge.net/id3/id3v2-00.txt">MP3 ID3 Version 2.2 specification</a>
+ */
+public class ID3v22Handler implements ID3Tags {
+ private String title;
+ private String artist;
+ private String album;
+ private String year;
+ private String comment;
+ private String genre;
+ private String trackNumber;
+
+ public ID3v22Handler(ID3v2Frame frame)
+ throws IOException, SAXException, TikaException {
+ RawTagIterator tags = new RawV22TagIterator(frame);
+ while (tags.hasNext()) {
+ RawTag tag = tags.next();
+ if (tag.name.equals("TT2")) {
+ title = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TP1")) {
+ artist = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TAL")) {
+ album = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TYE")) {
+ year = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("COM")) {
+ comment = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TRK")) {
+ trackNumber = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TCO")) {
+ String rawGenre = getTagString(tag.data, 0, tag.data.length);
+ int open = rawGenre.indexOf("(");
+ int close = rawGenre.indexOf(")");
+ if (open < close) {
+ try {
+ int genreID = Integer.parseInt(rawGenre.substring(open+1, close));
+ genre = ID3Tags.GENRES[genreID];
+ } catch(NumberFormatException ignore) {
+ }
+ }
+ }
+ }
+ }
+
+ private String getTagString(byte[] data, int offset, int length) {
+ return ID3v2Frame.getTagString(data, offset, length);
+ }
+
+ public boolean getTagsPresent() {
+ return true;
+ }
+
+ public String getTitle() {
+ return title;
+ }
+
+ public String getArtist() {
+ return artist;
+ }
+
+ public String getAlbum() {
+ return album;
+ }
+
+ public String getYear() {
+ return year;
+ }
+
+ public String getComment() {
+ return comment;
+ }
+
+ public String getGenre() {
+ return genre;
+ }
+
+ public String getTrackNumber() {
+ return trackNumber;
+ }
+
+ private class RawV22TagIterator extends RawTagIterator {
+ private RawV22TagIterator(ID3v2Frame frame) {
+ frame.super(3, 3, 1, 0);
+ }
+ }
+
+}
Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java?rev=903176&view=auto
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java (added)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java Tue Jan 26 11:27:33 2010
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTag;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTagIterator;
+import org.xml.sax.SAXException;
+
+/**
+ * This is used to parse ID3 Version 2.3 Tag information from an MP3 file,
+ * if available.
+ *
+ * @see <a href="http://id3lib.sourceforge.net/id3/id3v2.3.0.html">MP3 ID3 Version 2.3 specification</a>
+ */
+public class ID3v23Handler implements ID3Tags {
+ private String title;
+ private String artist;
+ private String album;
+ private String year;
+ private String comment;
+ private String genre;
+ private String trackNumber;
+
+ public ID3v23Handler(ID3v2Frame frame)
+ throws IOException, SAXException, TikaException {
+ RawTagIterator tags = new RawV23TagIterator(frame);
+ while (tags.hasNext()) {
+ RawTag tag = tags.next();
+ if (tag.name.equals("TIT2")) {
+ title = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TPE1")) {
+ artist = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TALB")) {
+ album = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TYER")) {
+ year = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("COMM")) {
+ comment = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TRCK")) {
+ trackNumber = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TCON")) {
+ String rawGenre = getTagString(tag.data, 0, tag.data.length);
+ int open = rawGenre.indexOf("(");
+ int close = rawGenre.indexOf(")");
+ if (open < close) {
+ try {
+ int genreID = Integer.parseInt(rawGenre.substring(open+1, close));
+ genre = ID3Tags.GENRES[genreID];
+ } catch(NumberFormatException ignore) {
+ }
+ }
+ }
+ }
+ }
+
+ private String getTagString(byte[] data, int offset, int length) {
+ return ID3v2Frame.getTagString(data, offset, length);
+ }
+
+ public boolean getTagsPresent() {
+ return true;
+ }
+
+ public String getTitle() {
+ return title;
+ }
+
+ public String getArtist() {
+ return artist;
+ }
+
+ public String getAlbum() {
+ return album;
+ }
+
+ public String getYear() {
+ return year;
+ }
+
+ public String getComment() {
+ return comment;
+ }
+
+ public String getGenre() {
+ return genre;
+ }
+
+ public String getTrackNumber() {
+ return trackNumber;
+ }
+
+ private class RawV23TagIterator extends RawTagIterator {
+ private RawV23TagIterator(ID3v2Frame frame) {
+ frame.super(4, 4, 1, 2);
+ }
+ }
+
+}
Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java?rev=903176&view=auto
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java (added)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java Tue Jan 26 11:27:33 2010
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTag;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTagIterator;
+import org.xml.sax.SAXException;
+
+/**
+ * This is used to parse ID3 Version 2.4 Tag information from an MP3 file,
+ * if available.
+ *
+ * @see <a href="http://www.id3.org/id3v2.4.0-structures">MP3 ID3 Version 2.4 specification</a>
+ * @see <a href="http://www.id3.org/id3v2.4.0-frames">MP3 ID3 Version 2.4 frames/tags</a>
+ */
+public class ID3v24Handler implements ID3Tags {
+ private String title;
+ private String artist;
+ private String album;
+ private String year;
+ private String comment;
+ private String genre;
+ private String trackNumber;
+
+ public ID3v24Handler(ID3v2Frame frame)
+ throws IOException, SAXException, TikaException {
+ RawTagIterator tags = new RawV24TagIterator(frame);
+ while (tags.hasNext()) {
+ RawTag tag = tags.next();
+ if (tag.name.equals("TIT2")) {
+ title = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TPE1")) {
+ artist = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TALB")) {
+ album = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TYER")) {
+ year = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("COMM")) {
+ comment = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TRCK")) {
+ trackNumber = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TCON")) {
+ String rawGenre = getTagString(tag.data, 0, tag.data.length);
+ int open = rawGenre.indexOf("(");
+ int close = rawGenre.indexOf(")");
+ if (open < close) {
+ try {
+ int genreID = Integer.parseInt(rawGenre.substring(open+1, close));
+ genre = ID3Tags.GENRES[genreID];
+ } catch(NumberFormatException ignore) {
+ }
+ }
+ }
+ }
+ }
+
+ private String getTagString(byte[] data, int offset, int length) {
+ return ID3v2Frame.getTagString(data, offset, length);
+ }
+
+ public boolean getTagsPresent() {
+ return true;
+ }
+
+ public String getTitle() {
+ return title;
+ }
+
+ public String getArtist() {
+ return artist;
+ }
+
+ public String getAlbum() {
+ return album;
+ }
+
+ public String getYear() {
+ return year;
+ }
+
+ public String getComment() {
+ return comment;
+ }
+
+ public String getGenre() {
+ return genre;
+ }
+
+ public String getTrackNumber() {
+ return trackNumber;
+ }
+
+ private class RawV24TagIterator extends RawTagIterator {
+ private RawV24TagIterator(ID3v2Frame frame) {
+ frame.super(4, 4, 4, 2);
+ }
+ }
+
+}
Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java?rev=903176&view=auto
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java (added)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java Tue Jan 26 11:27:33 2010
@@ -0,0 +1,283 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.util.Iterator;
+
+/**
+ * A frame of ID3v2 data, which is then passed to a handler to
+ * be turned into useful data.
+ */
+public class ID3v2Frame {
+ private int majorVersion;
+ private int minorVersion;
+ private int flags;
+ private int length;
+ /** Excludes the header size part */
+ private byte[] extendedHeader;
+ private byte[] data;
+
+ public int getMajorVersion() {
+ return majorVersion;
+ }
+
+ public int getMinorVersion() {
+ return minorVersion;
+ }
+
+ public int getFlags() {
+ return flags;
+ }
+
+ public int getLength() {
+ return length;
+ }
+
+ public byte[] getExtendedHeader() {
+ return extendedHeader;
+ }
+
+ public byte[] getData() {
+ return data;
+ }
+
+ /**
+ * Returns a frame of ID3v2 data, or null if the
+ * next data to be read from the InputStream
+ * doesn't correspond to an ID3v2 Frame
+ */
+ public static ID3v2Frame createFrameIfPresent(InputStream inp)
+ throws IOException {
+ int h1 = inp.read();
+ int h2 = inp.read();
+ int h3 = inp.read();
+ if (h1 == (int)'I' && h2 == (int)'D' && h3 == (int)'3') {
+ int majorVersion = inp.read();
+ int minorVersion = inp.read();
+ if (majorVersion == -1 || minorVersion == -1) {
+ return null;
+ }
+ return new ID3v2Frame(majorVersion, minorVersion, inp);
+ }
+
+ // Not a frame header
+ return null;
+ }
+
+ private ID3v2Frame(int majorVersion, int minorVersion, InputStream inp)
+ throws IOException {
+ this.majorVersion = majorVersion;
+ this.minorVersion = minorVersion;
+
+ // Get the flags and the length
+ flags = inp.read();
+ length = 4 * getInt(readFully(inp, 4));
+
+ // Do we have an extended header?
+ if ((flags & 0x02) == 0x02) {
+ int size = getInt(readFully(inp, 4));
+ extendedHeader = readFully(inp, size);
+ }
+
+ // Get the frame's data
+ data = readFully(inp, length);
+ }
+
+ protected static int getInt(byte[] data) {
+ return getInt(data, 0);
+ }
+
+ protected static int getInt(byte[] data, int offset) {
+ int b0 = data[offset+0] & 0xFF;
+ int b1 = data[offset+1] & 0xFF;
+ int b2 = data[offset+2] & 0xFF;
+ int b3 = data[offset+3] & 0xFF;
+ return (b0 << 24) + (b1 << 16) + (b2 << 8) + (b3 << 0);
+ }
+
+ protected static int getInt3(byte[] data, int offset) {
+ int b0 = data[offset+0] & 0xFF;
+ int b1 = data[offset+1] & 0xFF;
+ int b2 = data[offset+2] & 0xFF;
+ return (b0 << 16) + (b1 << 8) + (b2 << 0);
+ }
+
+ protected static int getInt2(byte[] data, int offset) {
+ int b0 = data[offset+0] & 0xFF;
+ int b1 = data[offset+1] & 0xFF;
+ return (b0 << 8) + (b1 << 0);
+ }
+
+ protected static byte[] readFully(InputStream inp, int length)
+ throws IOException {
+ byte[] b = new byte[length];
+
+ int pos = 0;
+ int read;
+ while (pos < length) {
+ read = inp.read(b, pos, length-pos);
+ if (read == -1) {
+ throw new IOException("Tried to read " + length + " bytes, but only " + pos + " bytes present");
+ }
+ pos += read;
+ }
+
+ return b;
+ }
+
+ /**
+ * Returns the (possibly null padded) String at the given offset and
+ * length. String encoding is held in the first byte;
+ */
+ protected static String getTagString(byte[] data, int offset, int length) {
+ int actualLength = length;
+ while (data[actualLength-1] == 0) {
+ actualLength--;
+ }
+
+ // Does it have an encoding flag?
+ // Detect by the first byte being sub 0x20
+ String encoding = "ISO-8859-1";
+ byte maybeEncodingFlag = data[offset];
+ if (maybeEncodingFlag == 0 || maybeEncodingFlag == 1) {
+ offset++;
+ actualLength--;
+ if (maybeEncodingFlag == 1) {
+ // With BOM
+ encoding = "UTF-16";
+ } else if (maybeEncodingFlag == 2) {
+ // Without BOM
+ encoding = "UTF-16BE";
+ } else if (maybeEncodingFlag == 3) {
+ encoding = "UTF8";
+ }
+ }
+
+ try {
+ return new String(data, offset, actualLength, encoding);
+ } catch (UnsupportedEncodingException e) {
+ throw new RuntimeException(
+ "Core encoding " + encoding + " is not available", e);
+ }
+ }
+
+ /**
+ * Returns the String at the given
+ * offset and length. Strings are ISO-8859-1
+ */
+ protected static String getString(byte[] data, int offset, int length) {
+ try {
+ return new String(data, offset, length, "ISO-8859-1");
+ } catch (UnsupportedEncodingException e) {
+ throw new RuntimeException(
+ "Core encoding ISO-8859-1 encoding is not available", e);
+ }
+ }
+
+
+ /**
+ * Iterates over id3v2 raw tags.
+ * Create an instance of this that configures the
+ * various length and multipliers.
+ */
+ protected class RawTagIterator implements Iterator<RawTag> {
+ private int nameLength;
+ private int sizeLength;
+ private int sizeMultiplier;
+ private int flagLength;
+
+ private int offset = 0;
+
+ protected RawTagIterator(
+ int nameLength, int sizeLength, int sizeMultiplier,
+ int flagLength) {
+ this.nameLength = nameLength;
+ this.sizeLength = sizeLength;
+ this.sizeMultiplier = sizeMultiplier;
+ this.flagLength = flagLength;
+ }
+
+ public boolean hasNext() {
+ if (offset < data.length) {
+ // Check for padding at the end
+ if (data[offset] != 0) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ public RawTag next() {
+ RawTag tag = new RawTag(nameLength, sizeLength, sizeMultiplier,
+ flagLength, data, offset);
+ offset += tag.getSize();
+ return tag;
+ }
+
+ public void remove() {
+ }
+
+ }
+
+ protected static class RawTag {
+ private int headerSize;
+ protected String name;
+ protected int flag;
+ protected byte[] data;
+
+ private RawTag(
+ int nameLength, int sizeLength, int sizeMultiplier,
+ int flagLength, byte[] frameData, int offset) {
+ headerSize = nameLength + sizeLength + flagLength;
+
+ // Name, normally 3 or 4 bytes
+ name = getString(frameData, offset, nameLength);
+
+ // Size
+ int rawSize;
+ if (sizeLength == 3) {
+ rawSize = getInt3(frameData, offset+nameLength);
+ } else {
+ rawSize = getInt(frameData, offset+nameLength);
+ }
+ int size = rawSize * sizeMultiplier;
+
+ // Flag
+ if (flagLength > 0) {
+ if (flagLength == 1) {
+ flag = (int)frameData[offset+nameLength+sizeLength];
+ } else {
+ flag = getInt2(frameData, offset+nameLength+sizeLength);
+ }
+ }
+
+ // Now data
+ data = new byte[size];
+ System.arraycopy(frameData,
+ offset+nameLength+sizeLength+flagLength, data, 0, size);
+ }
+
+ protected int getSize() {
+ return headerSize + data.length;
+ }
+
+ }
+
+}
Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java?rev=903176&r1=903175&r2=903176&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java Tue Jan 26 11:27:33 2010
@@ -18,7 +18,8 @@
import java.io.IOException;
import java.io.InputStream;
-import java.io.UnsupportedEncodingException;
+import java.util.ArrayList;
+import java.util.List;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
@@ -33,144 +34,11 @@
* from an MP3 file, if available.
*
* @see <a href="http://www.id3.org/ID3v1">MP3 ID3 Version 1 specification</a>
+ * @see <a href="http://www.id3.org/id3v2.4.0-structure">MP3 ID3 Version 2.4 Structure Specification</a>
+ * @see <a href="http://www.id3.org/id3v2.4.0-frames">MP3 ID3 Version 2.4 Frames Specification</a>
*/
public class Mp3Parser implements Parser {
- /**
- * List of predefined genres.
- *
- * @see http://www.id3.org/id3v2-00
- */
- private static final String[] GENRES = new String[] {
- /* 0 */ "Blues",
- /* 1 */ "Classic Rock",
- /* 2 */ "Country",
- /* 3 */ "Dance",
- /* 4 */ "Disco",
- /* 5 */ "Funk",
- /* 6 */ "Grunge",
- /* 7 */ "Hip-Hop",
- /* 8 */ "Jazz",
- /* 9 */ "Metal",
- /* 10 */ "New Age",
- /* 11 */ "Oldies",
- /* 12 */ "Other",
- /* 13 */ "Pop",
- /* 14 */ "R&B",
- /* 15 */ "Rap",
- /* 16 */ "Reggae",
- /* 17 */ "Rock",
- /* 18 */ "Techno",
- /* 19 */ "Industrial",
- /* 20 */ "Alternative",
- /* 21 */ "Ska",
- /* 22 */ "Death Metal",
- /* 23 */ "Pranks",
- /* 24 */ "Soundtrack",
- /* 25 */ "Euro-Techno",
- /* 26 */ "Ambient",
- /* 27 */ "Trip-Hop",
- /* 28 */ "Vocal",
- /* 29 */ "Jazz+Funk",
- /* 30 */ "Fusion",
- /* 31 */ "Trance",
- /* 32 */ "Classical",
- /* 33 */ "Instrumental",
- /* 34 */ "Acid",
- /* 35 */ "House",
- /* 36 */ "Game",
- /* 37 */ "Sound Clip",
- /* 38 */ "Gospel",
- /* 39 */ "Noise",
- /* 40 */ "AlternRock",
- /* 41 */ "Bass",
- /* 42 */ "Soul",
- /* 43 */ "Punk",
- /* 44 */ "Space",
- /* 45 */ "Meditative",
- /* 46 */ "Instrumental Pop",
- /* 47 */ "Instrumental Rock",
- /* 48 */ "Ethnic",
- /* 49 */ "Gothic",
- /* 50 */ "Darkwave",
- /* 51 */ "Techno-Industrial",
- /* 52 */ "Electronic",
- /* 53 */ "Pop-Folk",
- /* 54 */ "Eurodance",
- /* 55 */ "Dream",
- /* 56 */ "Southern Rock",
- /* 57 */ "Comedy",
- /* 58 */ "Cult",
- /* 59 */ "Gangsta",
- /* 60 */ "Top 40",
- /* 61 */ "Christian Rap",
- /* 62 */ "Pop/Funk",
- /* 63 */ "Jungle",
- /* 64 */ "Native American",
- /* 65 */ "Cabaret",
- /* 66 */ "New Wave",
- /* 67 */ "Psychadelic",
- /* 68 */ "Rave",
- /* 69 */ "Showtunes",
- /* 70 */ "Trailer",
- /* 71 */ "Lo-Fi",
- /* 72 */ "Tribal",
- /* 73 */ "Acid Punk",
- /* 74 */ "Acid Jazz",
- /* 75 */ "Polka",
- /* 76 */ "Retro",
- /* 77 */ "Musical",
- /* 78 */ "Rock & Roll",
- /* 79 */ "Hard Rock",
- /* 80 */ "Folk",
- /* 81 */ "Folk-Rock",
- /* 82 */ "National Folk",
- /* 83 */ "Swing",
- /* 84 */ "Fast Fusion",
- /* 85 */ "Bebob",
- /* 86 */ "Latin",
- /* 87 */ "Revival",
- /* 88 */ "Celtic",
- /* 89 */ "Bluegrass",
- /* 90 */ "Avantgarde",
- /* 91 */ "Gothic Rock",
- /* 92 */ "Progressive Rock",
- /* 93 */ "Psychedelic Rock",
- /* 94 */ "Symphonic Rock",
- /* 95 */ "Slow Rock",
- /* 96 */ "Big Band",
- /* 97 */ "Chorus",
- /* 98 */ "Easy Listening",
- /* 99 */ "Acoustic",
- /* 100 */ "Humour",
- /* 101 */ "Speech",
- /* 102 */ "Chanson",
- /* 103 */ "Opera",
- /* 104 */ "Chamber Music",
- /* 105 */ "Sonata",
- /* 106 */ "Symphony",
- /* 107 */ "Booty Bass",
- /* 108 */ "Primus",
- /* 109 */ "Porn Groove",
- /* 110 */ "Satire",
- /* 111 */ "Slow Jam",
- /* 112 */ "Club",
- /* 113 */ "Tango",
- /* 114 */ "Samba",
- /* 115 */ "Folklore",
- /* 116 */ "Ballad",
- /* 117 */ "Power Ballad",
- /* 118 */ "Rhythmic Soul",
- /* 119 */ "Freestyle",
- /* 120 */ "Duet",
- /* 121 */ "Punk Rock",
- /* 122 */ "Drum Solo",
- /* 123 */ "A capella",
- /* 124 */ "Euro-House",
- /* 125 */ "Dance Hall",
- /* sentinel */ ""
- };
-
public void parse(
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
@@ -179,34 +47,28 @@
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
-
- byte[] tag = getSuffix(stream, 128);
- if (tag.length == 128
- && tag[0] == 'T' && tag[1] == 'A' && tag[2] == 'G') {
- String title = getString(tag, 3, 33);
- String artist = getString(tag, 33, 63);
- String album = getString(tag, 63, 93);
- String year = getString(tag, 93, 97);
- String comment = getString(tag, 97, 127);
- int genre = (int) tag[127] & 0xff; // unsigned byte
-
- metadata.set(Metadata.TITLE, title);
- metadata.set(Metadata.AUTHOR, artist);
-
- xhtml.element("h1", title);
- xhtml.element("p", artist);
+
+ // Create handlers for the various kinds of ID3 tags
+ ID3Tags[] tags = getAllTagHandlers(stream, handler);
+
+ if (tags.length > 0) {
+ CompositeTagHandler tag = new CompositeTagHandler(tags);
+
+ metadata.set(Metadata.TITLE, tag.getTitle());
+ metadata.set(Metadata.AUTHOR, tag.getArtist());
+
+ xhtml.element("h1", tag.getTitle());
+ xhtml.element("p", tag.getArtist());
+
// ID3v1.1 Track addition
- // If the last two bytes of the comment field are zero and
- // non-zero, then the last byte is the track number
- if (tag[125] == 0 && tag[126] != 0) {
- int track = (int) tag[126] & 0xff;
- xhtml.element("p", album + ", track " + track);
+ if (tag.getTrackNumber() != null) {
+ xhtml.element("p", tag.getAlbum() + ", track " + tag.getTrackNumber());
} else {
- xhtml.element("p", album);
+ xhtml.element("p", tag.getAlbum());
}
- xhtml.element("p", year);
- xhtml.element("p", comment);
- xhtml.element("p", GENRES[Math.min(genre, GENRES.length - 1)]);
+ xhtml.element("p", tag.getYear());
+ xhtml.element("p", tag.getComment());
+ xhtml.element("p", tag.getGenre());
}
xhtml.endDocument();
@@ -222,75 +84,51 @@
}
/**
- * Returns the identified ISO-8859-1 substring from the given byte buffer.
- * The return value is the zero-terminated substring retrieved from
- * between the given start and end positions in the given byte buffer.
- * Extra whitespace (and control characters) from the beginning and the
- * end of the substring is removed.
- *
- * @param buffer byte buffer
- * @param start start index of the substring
- * @param end end index of the substring
- * @return the identified substring
- * @throws TikaException if the ISO-8859-1 encoding is not available
+ * Scans the MP3 frames for ID3 tags, and creates ID3Tag Handlers
+ * for each supported set of tags.
*/
- private static String getString(byte[] buffer, int start, int end)
- throws TikaException {
- // Find the zero byte that marks the end of the string
- int zero = start;
- while (zero < end && buffer[zero] != 0) {
- zero++;
- }
-
- // Skip trailing whitespace
- end = zero;
- while (start < end && buffer[end - 1] <= ' ') {
- end--;
- }
-
- // Skip leading whitespace
- while (start < end && buffer[start] <= ' ') {
- start++;
- }
+ protected ID3Tags[] getAllTagHandlers(InputStream stream, ContentHandler handler)
+ throws IOException, SAXException, TikaException {
+ ID3v24Handler v24 = null;
+ ID3v23Handler v23 = null;
+ ID3v22Handler v22 = null;
+ ID3v1Handler v1 = null;
+
+ // ID3v2 tags live at the start of the file
+ // You can apparently have several different ID3 tag blocks
+ // So, keep going until we don't find any more
+ ID3v2Frame f;
+ while ((f = ID3v2Frame.createFrameIfPresent(stream)) != null) {
+ if (f.getMajorVersion() == 4) {
+ v24 = new ID3v24Handler(f);
+ } else if(f.getMajorVersion() == 3) {
+ v23 = new ID3v23Handler(f);
+ } else if(f.getMajorVersion() == 2) {
+ v22 = new ID3v22Handler(f);
+ }
+ }
- // Return the remaining substring
- try {
- return new String(buffer, start, end - start, "ISO-8859-1");
- } catch (UnsupportedEncodingException e) {
- throw new TikaException("ISO-8859-1 encoding is not available", e);
- }
- }
+ // ID3v1 tags live at the end of the file
+ // Just let the handler run until it's finished
+ v1 = new ID3v1Handler(stream, handler);
+
+ // Go in order of preference
+ // Currently, that's newest to oldest
+ List<ID3Tags> tags = new ArrayList<ID3Tags>();
- /**
- * Reads and returns the last <code>length</code> bytes from the
- * given stream.
- * @param stream input stream
- * @param length number of bytes from the end to read and return
- * @return stream the <code>InputStream</code> to read from.
- * @throws IOException if the stream could not be read from.
- */
- private static byte[] getSuffix(InputStream stream, int length)
- throws IOException {
- byte[] buffer = new byte[2 * length];
- int bytesInBuffer = 0;
-
- int n = stream.read(buffer);
- while (n != -1) {
- bytesInBuffer += n;
- if (bytesInBuffer == buffer.length) {
- System.arraycopy(buffer, bytesInBuffer - length, buffer, 0, length);
- bytesInBuffer = length;
- }
- n = stream.read(buffer, bytesInBuffer, buffer.length - bytesInBuffer);
+ if(v24 != null && v24.getTagsPresent()) {
+ tags.add(v24);
}
-
- if (bytesInBuffer < length) {
- length = bytesInBuffer;
+ if(v23 != null && v23.getTagsPresent()) {
+ tags.add(v23);
}
-
- byte[] result = new byte[length];
- System.arraycopy(buffer, bytesInBuffer - length, result, 0, length);
- return result;
- }
+ if(v22 != null && v22.getTagsPresent()) {
+ tags.add(v22);
+ }
+ if(v1 != null && v1.getTagsPresent()) {
+ tags.add(v1);
+ }
+ return tags.toArray(new ID3Tags[tags.size()]);
+ }
}
Modified: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/TestParsers.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/TestParsers.java?rev=903176&r1=903175&r2=903176&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/TestParsers.java (original)
+++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/TestParsers.java Tue Jan 26 11:27:33 2010
@@ -161,7 +161,7 @@
}
public void testMP3Extraction() throws Exception {
- File file = getResourceAsFile("/test-documents/testMP3.mp3");
+ File file = getResourceAsFile("/test-documents/testMP3id3v1.mp3");
String s1 = ParseUtils.getStringContent(file, tc);
String s2 = ParseUtils.getStringContent(file, tc, "audio/mpeg");
assertEquals(s1, s2);
Modified: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java?rev=903176&r1=903175&r2=903176&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java (original)
+++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java Tue Jan 26 11:27:33 2010
@@ -16,6 +16,7 @@
*/
package org.apache.tika.parser.mp3;
+import java.io.ByteArrayInputStream;
import java.io.InputStream;
import junit.framework.TestCase;
@@ -31,13 +32,16 @@
*/
public class Mp3ParserTest extends TestCase {
- public void testMp3Parsing() throws Exception {
+ /**
+ * Test that with only ID3v1 tags, we get some information out
+ */
+ public void testMp3ParsingID3v1() throws Exception {
Parser parser = new AutoDetectParser(); // Should auto-detect!
ContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
InputStream stream = Mp3ParserTest.class.getResourceAsStream(
- "/test-documents/testMP3.mp3");
+ "/test-documents/testMP3id3v1.mp3");
try {
parser.parse(stream, handler, metadata);
} finally {
@@ -57,4 +61,80 @@
assertTrue(content.contains("Rock"));
}
+ /**
+ * Test that with only ID3v2 tags, we get the full
+ * set of information out.
+ */
+ public void testMp3ParsingID3v2() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ InputStream stream = Mp3ParserTest.class.getResourceAsStream(
+ "/test-documents/testMP3id3v2.mp3");
+ try {
+ parser.parse(stream, handler, metadata);
+ } finally {
+ stream.close();
+ }
+
+ assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Test Title", metadata.get(Metadata.TITLE));
+ assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
+
+ String content = handler.toString();
+ assertTrue(content.contains("Test Title"));
+ assertTrue(content.contains("Test Artist"));
+ assertTrue(content.contains("Test Album"));
+ assertTrue(content.contains("2008"));
+ assertTrue(content.contains("Test Comment"));
+ assertTrue(content.contains("Rock"));
+ }
+
+ /**
+ * Test that with both id3v2 and id3v1, we prefer the
+ * details from id3v2
+ */
+ public void testMp3ParsingID3v1v2() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ InputStream stream = Mp3ParserTest.class.getResourceAsStream(
+ "/test-documents/testMP3id3v1_v2.mp3");
+ try {
+ parser.parse(stream, handler, metadata);
+ } finally {
+ stream.close();
+ }
+
+ assertEquals("audio/mpeg", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Test Title", metadata.get(Metadata.TITLE));
+ assertEquals("Test Artist", metadata.get(Metadata.AUTHOR));
+
+ String content = handler.toString();
+ assertTrue(content.contains("Test Title"));
+ assertTrue(content.contains("Test Artist"));
+ assertTrue(content.contains("Test Album"));
+ assertTrue(content.contains("2008"));
+ assertTrue(content.contains("Test Comment"));
+ assertTrue(content.contains("Rock"));
+ }
+
+ public void testID3v2Frame() throws Exception {
+ byte[] empty = new byte[] {
+ 0x49, 0x44, 0x33, 3, 1, 0,
+ 0, 0, 0, 0
+ };
+
+ assertEquals(11, ID3v2Frame.getInt(new byte[] {0,0,0,0x0b}));
+ assertEquals(257, ID3v2Frame.getInt(new byte[] {0,0,1,1}));
+
+ ID3v2Frame f = ID3v2Frame.createFrameIfPresent(new ByteArrayInputStream(empty));
+ assertEquals(3, f.getMajorVersion());
+ assertEquals(1, f.getMinorVersion());
+ assertEquals(0, f.getFlags());
+ assertEquals(0, f.getLength());
+ assertEquals(0, f.getData().length);
+ }
}
Added: lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3id3v1_v2.mp3
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3id3v1_v2.mp3?rev=903176&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3id3v1_v2.mp3
------------------------------------------------------------------------------
svn:executable = *
Propchange: lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3id3v1_v2.mp3
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3id3v2.mp3
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3id3v2.mp3?rev=903176&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3id3v2.mp3
------------------------------------------------------------------------------
svn:executable = *
Propchange: lucene/tika/trunk/tika-parsers/src/test/resources/test-documents/testMP3id3v2.mp3
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream