You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/01/26 18:36:36 UTC
svn commit: r903334 - in /lucene/tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/mp3/ test/java/org/apache/tika/parser/mp3/
Author: jukka
Date: Tue Jan 26 17:36:35 2010
New Revision: 903334
URL: http://svn.apache.org/viewvc?rev=903334&view=rev
Log:
TIKA-372: Channel and SampleRate information for MP3 files
Patch by Nick Burch
Added:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java
Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java?rev=903334&view=auto
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java (added)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java Tue Jan 26 17:36:35 2010
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * An Audio Frame in an MP3 file. These come after the ID3v2 tags in the file.
+ * Currently, only the 4 byte frame header is processed, not the raw audio data.
+ * The header is decoded into a descriptive version string, the sampling
+ * rate and the number of channels.
+ */
+public class AudioFrame implements MP3Frame {
+    /** Marker for a header byte that has not been read from the stream yet. */
+    private static final int UNREAD = -2;
+
+    private String version;
+    private int sampleRate;
+    private int channels;
+
+    /**
+     * Get the description of the frame format, for example
+     * "MPEG 3 Layer III Version 1"
+     */
+    public String getVersion() {
+        return version;
+    }
+
+    /**
+     * Get the sampling rate, in Hz
+     */
+    public int getSampleRate() {
+        return sampleRate;
+    }
+
+    /**
+     * Get the number of channels (1=mono, 2=stereo)
+     */
+    public int getChannels() {
+        return channels;
+    }
+
+    /**
+     * Does this appear to be a 4 byte audio frame header?
+     * Each argument is one header byte as returned by
+     * {@link InputStream#read()}, so -1 means the stream ended.
+     *
+     * @return true if all four bytes are present and the frame
+     *         sync bits are set, false otherwise
+     */
+    public static boolean isAudioHeader(int h1, int h2, int h3, int h4) {
+        // End of stream (-1), or anything else that isn't a real byte value
+        if (h1 < 0 || h2 < 0 || h3 < 0 || h4 < 0) {
+            return false;
+        }
+        // Check for the magic 11 bits set at the start: all 8 bits of
+        // the first byte, plus the top 3 bits of the second byte
+        // Note - doesn't do a CRC check
+        return h1 == 0xff && (h2 & 0xe0) == 0xe0;
+    }
+
+    /**
+     * Reads the next 4 bytes from the stream, and decodes them
+     * as an audio frame header.
+     *
+     * @param stream the stream to read the header from
+     * @param handler not used
+     * @throws IllegalArgumentException if the 4 bytes read are not
+     *         an audio frame header
+     */
+    public AudioFrame(InputStream stream, ContentHandler handler)
+            throws IOException, SAXException, TikaException {
+        this(UNREAD, UNREAD, UNREAD, UNREAD, stream);
+    }
+
+    /**
+     * Decodes an audio frame header. If all four header values
+     * are -2, then the header is read from the stream first,
+     * otherwise the supplied values are used and the stream
+     * is left untouched.
+     *
+     * @throws IllegalArgumentException if the 4 bytes are not
+     *         an audio frame header
+     */
+    public AudioFrame(int h1, int h2, int h3, int h4, InputStream in)
+            throws IOException {
+        if (h1 == UNREAD && h2 == UNREAD && h3 == UNREAD && h4 == UNREAD) {
+            h1 = in.read();
+            h2 = in.read();
+            h3 = in.read();
+            h4 = in.read();
+        }
+
+        if (!isAudioHeader(h1, h2, h3, h4)) {
+            throw new IllegalArgumentException("Magic Audio Frame Header not found");
+        }
+
+        // Second byte: 3 sync bits, 2 version bits, 2 layer bits, 1 CRC bit
+        int layer = (h2 >> 1) & 0x03;
+        int ver = (h2 >> 3) & 0x03;
+        version = "MPEG 3 Layer " + describeLayer(layer)
+                + " Version " + describeVersion(ver);
+
+        // Third byte: 4 bitrate bits, 2 sample rate bits, padding, private
+        sampleRate = decodeSampleRate((h3 >> 2) & 0x03, ver);
+
+        // Fourth byte: the channel mode is held in the top 2 bits
+        // (the bottom 2 bits are the emphasis, not the channels)
+        int mode = (h4 >> 6) & 0x03;
+        if (mode == 3) {
+            // Single channel
+            channels = 1;
+        } else {
+            // Stereo, joint stereo, dual channel
+            channels = 2;
+        }
+    }
+
+    /** Turns the 2 bit layer code into its name. */
+    private static String describeLayer(int layer) {
+        switch (layer) {
+            case 1:
+                return "III";
+            case 2:
+                return "II";
+            case 3:
+                return "I";
+            default:
+                return "(reserved)";
+        }
+    }
+
+    /** Turns the 2 bit MPEG version code into its name. */
+    private static String describeVersion(int ver) {
+        switch (ver) {
+            case 0:
+                return "2.5";
+            case 2:
+                return "2";
+            case 3:
+                return "1";
+            default:
+                return "(reserved)";
+        }
+    }
+
+    /**
+     * Works out the sampling rate in Hz. The rate code picks the
+     * base (MPEG 2.5) rate, which is doubled for MPEG 2 and
+     * quadrupled for MPEG 1.
+     */
+    private static int decodeSampleRate(int rate, int ver) {
+        int base;
+        switch (rate) {
+            case 0:
+                base = 11025;
+                break;
+            case 1:
+                base = 12000;
+                break;
+            default:
+                // Code 3 is reserved, it is treated the same as code 2
+                base = 8000;
+        }
+        if (ver == 2) {
+            return base * 2;
+        } else if (ver == 3) {
+            return base * 4;
+        }
+        return base;
+    }
+
+}
Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java?rev=903334&r1=903333&r2=903334&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java Tue Jan 26 17:36:35 2010
@@ -43,25 +43,33 @@
public ID3v1Handler(InputStream stream, ContentHandler handler)
throws IOException, SAXException, TikaException {
- byte[] tag = getSuffix(stream, 128);
- if (tag.length == 128
- && tag[0] == 'T' && tag[1] == 'A' && tag[2] == 'G') {
+ this(getSuffix(stream, 128));
+ }
+
+ /**
+ * Creates from the last 128 bytes of a stream.
+ * @param tagData Must be the last 128 bytes
+ */
+ protected ID3v1Handler(byte[] tagData)
+ throws IOException, SAXException, TikaException {
+ if (tagData.length == 128
+ && tagData[0] == 'T' && tagData[1] == 'A' && tagData[2] == 'G') {
found = true;
- title = getString(tag, 3, 33);
- artist = getString(tag, 33, 63);
- album = getString(tag, 63, 93);
- year = getString(tag, 93, 97);
- comment = getString(tag, 97, 127);
+ title = getString(tagData, 3, 33);
+ artist = getString(tagData, 33, 63);
+ album = getString(tagData, 63, 93);
+ year = getString(tagData, 93, 97);
+ comment = getString(tagData, 97, 127);
- int genreID = (int) tag[127] & 0xff; // unsigned byte
+ int genreID = (int) tagData[127] & 0xff; // unsigned byte
genre = GENRES[Math.min(genreID, GENRES.length - 1)];
// ID3v1.1 Track addition
// If the last two bytes of the comment field are zero and
// non-zero, then the last byte is the track number
- if (tag[125] == 0 && tag[126] != 0) {
- int trackNum = (int) tag[126] & 0xff;
+ if (tagData[125] == 0 && tagData[126] != 0) {
+ int trackNum = (int) tagData[126] & 0xff;
trackNumber = Integer.toString(trackNum);
}
}
@@ -100,7 +108,6 @@
return trackNumber;
}
-
/**
* Returns the identified ISO-8859-1 substring from the given byte buffer.
* The return value is the zero-terminated substring retrieved from
Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java?rev=903334&r1=903333&r2=903334&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java Tue Jan 26 17:36:35 2010
@@ -25,7 +25,7 @@
* A frame of ID3v2 data, which is then passed to a handler to
* be turned into useful data.
*/
-public class ID3v2Frame {
+public class ID3v2Frame implements MP3Frame {
private int majorVersion;
private int minorVersion;
private int flags;
@@ -59,15 +59,19 @@
}
/**
- * Returns a frame of ID3v2 data, or null if the
- * next data to be read from the InputStream
- * doesn't correspond to an ID3v2 Frame
+ * Returns the next Frame (ID3v2 or Audio) in
+ * the file, or null if the next batch of data
+ * doesn't correspond to either an ID3v2 Frame
+ * or an Audio Frame.
+ * ID3v2 Frames should come before all Audio ones.
*/
- public static ID3v2Frame createFrameIfPresent(InputStream inp)
+ public static MP3Frame createFrameIfPresent(InputStream inp)
throws IOException {
int h1 = inp.read();
int h2 = inp.read();
int h3 = inp.read();
+
+ // Is it an ID3v2 Frame?
if (h1 == (int)'I' && h2 == (int)'D' && h3 == (int)'3') {
int majorVersion = inp.read();
int minorVersion = inp.read();
@@ -76,6 +80,12 @@
}
return new ID3v2Frame(majorVersion, minorVersion, inp);
}
+
+ // Is it an Audio Frame?
+ int h4 = inp.read();
+ if (AudioFrame.isAudioHeader(h1, h2, h3, h4)) {
+ return new AudioFrame(h1, h2, h3, h4, inp);
+ }
// Not a frame header
return null;
@@ -88,7 +98,7 @@
// Get the flags and the length
flags = inp.read();
- length = 4 * getInt(readFully(inp, 4));
+ length = get7BitsInt(readFully(inp, 4), 0);
// Do we have an extended header?
if ((flags & 0x02) == 0x02) {
@@ -125,6 +135,19 @@
return (b0 << 8) + (b1 << 0);
}
+ /**
+  * Decodes a Synchsafe integer: 4 bytes that together
+  * hold a 28 bit number, most significant byte first.
+  * Only the low 7 bits of each byte are used, the
+  * highest bit of each byte is ignored.
+  *
+  * @param data the buffer holding the 4 bytes
+  * @param offset the index in data of the first of the 4 bytes
+  */
+ protected static int get7BitsInt(byte[] data, int offset) {
+     int value = 0;
+     for (int i = 0; i < 4; i++) {
+         // Make room for the next 7 bits, then append them
+         value = (value << 7) | (data[offset + i] & 0x7F);
+     }
+     return value;
+ }
+
+
protected static byte[] readFully(InputStream inp, int length)
throws IOException {
byte[] b = new byte[length];
Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java?rev=903334&view=auto
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java (added)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java Tue Jan 26 17:36:35 2010
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+
+/**
+ * A frame in an MP3 file, such as ID3v2 Tags or some
+ * audio.
+ * This interface declares no methods, it only gives the
+ * different kinds of frame (ID3v2Frame and AudioFrame)
+ * a common type. Code that receives an MP3Frame has to
+ * check which kind of frame it is before using it.
+ */
+public interface MP3Frame {
+}
Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java?rev=903334&r1=903333&r2=903334&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java Tue Jan 26 17:36:35 2010
@@ -47,19 +47,19 @@
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
-
+
// Create handlers for the various kinds of ID3 tags
- ID3Tags[] tags = getAllTagHandlers(stream, handler);
+ ID3TagsAndAudio audioAndTags = getAllTagHandlers(stream, handler);
+
+ if (audioAndTags.tags.length > 0) {
+ CompositeTagHandler tag = new CompositeTagHandler(audioAndTags.tags);
- if (tags.length > 0) {
- CompositeTagHandler tag = new CompositeTagHandler(tags);
-
metadata.set(Metadata.TITLE, tag.getTitle());
metadata.set(Metadata.AUTHOR, tag.getArtist());
xhtml.element("h1", tag.getTitle());
xhtml.element("p", tag.getArtist());
-
+
// ID3v1.1 Track addition
if (tag.getTrackNumber() != null) {
xhtml.element("p", tag.getAlbum() + ", track " + tag.getTrackNumber());
@@ -70,6 +70,11 @@
xhtml.element("p", tag.getComment());
xhtml.element("p", tag.getGenre());
}
+ if (audioAndTags.audio != null) {
+ metadata.set("samplerate", String.valueOf(audioAndTags.audio.getSampleRate()));
+ metadata.set("channels", String.valueOf(audioAndTags.audio.getChannels()));
+ metadata.set("version", audioAndTags.audio.getVersion());
+ }
xhtml.endDocument();
}
@@ -87,29 +92,35 @@
* Scans the MP3 frames for ID3 tags, and creates ID3Tag Handlers
* for each supported set of tags.
*/
- protected ID3Tags[] getAllTagHandlers(InputStream stream, ContentHandler handler)
+ protected static ID3TagsAndAudio getAllTagHandlers(InputStream stream, ContentHandler handler)
throws IOException, SAXException, TikaException {
ID3v24Handler v24 = null;
ID3v23Handler v23 = null;
ID3v22Handler v22 = null;
ID3v1Handler v1 = null;
+ AudioFrame firstAudio = null;
// ID3v2 tags live at the start of the file
// You can apparently have several different ID3 tag blocks
// So, keep going until we don't find any more
- ID3v2Frame f;
- while ((f = ID3v2Frame.createFrameIfPresent(stream)) != null) {
- if (f.getMajorVersion() == 4) {
- v24 = new ID3v24Handler(f);
- } else if(f.getMajorVersion() == 3) {
- v23 = new ID3v23Handler(f);
- } else if(f.getMajorVersion() == 2) {
- v22 = new ID3v22Handler(f);
- }
+ MP3Frame f;
+ while ((f = ID3v2Frame.createFrameIfPresent(stream)) != null && firstAudio == null) {
+ if(f instanceof ID3v2Frame) {
+ ID3v2Frame id3F = (ID3v2Frame)f;
+ if (id3F.getMajorVersion() == 4) {
+ v24 = new ID3v24Handler(id3F);
+ } else if(id3F.getMajorVersion() == 3) {
+ v23 = new ID3v23Handler(id3F);
+ } else if(id3F.getMajorVersion() == 2) {
+ v22 = new ID3v22Handler(id3F);
+ }
+ } else if(f instanceof AudioFrame) {
+ firstAudio = (AudioFrame)f;
+ }
}
// ID3v1 tags live at the end of the file
- // Just let the handler run until it's finished
+ // Our handler handily seeks to the end for us
v1 = new ID3v1Handler(stream, handler);
// Go in order of preference
@@ -128,7 +139,16 @@
if(v1 != null && v1.getTagsPresent()) {
tags.add(v1);
}
- return tags.toArray(new ID3Tags[tags.size()]);
+
+ ID3TagsAndAudio ret = new ID3TagsAndAudio();
+ ret.audio = firstAudio;
+ ret.tags = tags.toArray(new ID3Tags[tags.size()]);
+ return ret;
+ }
+
+ /**
+  * Holds the two things that getAllTagHandlers returns
+  * together: the ID3 tag handlers, and the audio frame.
+  */
+ protected static class ID3TagsAndAudio {
+ // The handlers for the ID3 tags that were found, may be empty
+ private ID3Tags[] tags;
+ // The first audio frame that was found, or null if there wasn't one
+ private AudioFrame audio;
}
}
Modified: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java?rev=903334&r1=903333&r2=903334&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java (original)
+++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java Tue Jan 26 17:36:35 2010
@@ -59,6 +59,10 @@
assertTrue(content.contains("2008"));
assertTrue(content.contains("Test Comment"));
assertTrue(content.contains("Rock"));
+
+ assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
+ assertEquals("44100", metadata.get("samplerate"));
+ assertEquals("2", metadata.get("channels"));
}
/**
@@ -89,6 +93,10 @@
assertTrue(content.contains("2008"));
assertTrue(content.contains("Test Comment"));
assertTrue(content.contains("Rock"));
+
+ assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
+ assertEquals("44100", metadata.get("samplerate"));
+ assertEquals("2", metadata.get("channels"));
}
/**
@@ -119,6 +127,10 @@
assertTrue(content.contains("2008"));
assertTrue(content.contains("Test Comment"));
assertTrue(content.contains("Rock"));
+
+ assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
+ assertEquals("44100", metadata.get("samplerate"));
+ assertEquals("2", metadata.get("channels"));
}
public void testID3v2Frame() throws Exception {
@@ -130,7 +142,8 @@
assertEquals(11, ID3v2Frame.getInt(new byte[] {0,0,0,0x0b}));
assertEquals(257, ID3v2Frame.getInt(new byte[] {0,0,1,1}));
- ID3v2Frame f = ID3v2Frame.createFrameIfPresent(new ByteArrayInputStream(empty));
+ ID3v2Frame f = (ID3v2Frame)
+ ID3v2Frame.createFrameIfPresent(new ByteArrayInputStream(empty));
assertEquals(3, f.getMajorVersion());
assertEquals(1, f.getMinorVersion());
assertEquals(0, f.getFlags());