You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/01/26 18:36:36 UTC

svn commit: r903334 - in /lucene/tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/mp3/ test/java/org/apache/tika/parser/mp3/

Author: jukka
Date: Tue Jan 26 17:36:35 2010
New Revision: 903334

URL: http://svn.apache.org/viewvc?rev=903334&view=rev
Log:
TIKA-372: Channel and SampleRate information for MP3 files

Patch by Nick Burch

Added:
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java
Modified:
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
    lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java

Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java?rev=903334&view=auto
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java (added)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java Tue Jan 26 17:36:35 2010
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * An Audio Frame in an MP3 file. These come after the ID3v2 tags in the file.
+ * Only the 4 byte frame header is decoded here; no audio data is read.
+ */
+public class AudioFrame implements MP3Frame {
+    private String version; // description text built from the layer and version bits
+    private int sampleRate; // in Hz
+    private int channels; // set to 1 or 2 by the constructor
+
+    public String getVersion() { // text of the form "MPEG 3 Layer <layer> Version <version>"
+        return version;
+    }
+
+    /**
+     * Get the sampling rate, in Hz
+     */
+    public int getSampleRate() {
+        return sampleRate;
+    }
+
+    /**
+     * Get the number of channels (1=mono, 2=stereo)
+     */
+    public int getChannels() {
+        return channels;
+    }
+
+    /**
+     * Do these 4 values look like an audio frame header? Only h1 and h2 are inspected beyond the -1 test.
+     */
+    public static boolean isAudioHeader(int h1, int h2, int h3, int h4) {
+        if (h1 == -1 || h2 == -1 || h3 == -1 || h4 == -1) { // -1 is InputStream.read()'s end-of-stream value
+            return false;
+        }
+        // Check for the sync marker at the start: first byte all ones, then the leading bits of the second.
+        // NOTE(review): 0x60 tests only bits 6 and 5 of h2; a full 11 bit sync needs mask 0xE0 - confirm. No CRC check is done.
+        if (h1 == 0xff && (h2 & 0x60) == 0x60) {
+            return true;
+        }
+        return false;
+    }
+
+
+    public AudioFrame(InputStream stream, ContentHandler handler) // handler is not used here
+            throws IOException, SAXException, TikaException {
+        this(-2, -2, -2, -2, stream); // four -2 values ask the other constructor to read the header from stream
+    }
+
+    public AudioFrame(int h1, int h2, int h3, int h4, InputStream in) // h1..h4: already-read header bytes
+            throws IOException {
+        if (h1 == -2 && h2 == -2 && h3 == -2 && h4 == -2) { // sentinel: header not read yet
+            h1 = in.read();
+            h2 = in.read();
+            h3 = in.read();
+            h4 = in.read();
+        }
+
+        if (isAudioHeader(h1, h2, h3, h4)) {
+            version = "MPEG 3 Layer ";
+            int layer = (h2 >> 1) & 0x03; // two-bit layer field of the second header byte
+            if (layer == 1) {
+                version += "III";
+            } else if (layer == 2) {
+                version += "II";
+            } else if (layer == 3) {
+                version += "I";
+            } else {
+                version += "(reserved)";
+            }
+
+            version += " Version ";
+            int ver = (h2 >> 3) & 0x03; // two-bit version field of the second header byte
+            if (ver == 0) {
+                version += "2.5";
+            } else if(ver == 2) {
+                version += "2";
+            } else if(ver == 3) {
+                version += "1";
+            } else {
+                version += "(reseved)";
+            }
+
+            int rate = (h3 >> 2) & 0x03; // two-bit sample rate index of the third header byte
+            switch (rate) { // base rates; they are kept unscaled when ver is 0 or 1
+            case 0:
+                sampleRate = 11025;
+                break;
+            case 1:
+                sampleRate = 12000;
+                break;
+            default: // index 2 and index 3 both end up here
+                sampleRate = 8000;
+            }
+            if (ver == 2) { // base rate is doubled for ver == 2 and quadrupled for ver == 3
+                sampleRate *= 2;
+            } else if(ver == 3) {
+                sampleRate *= 4;
+            }
+
+            int chans = h4 & 0x03; // NOTE(review): lowest two bits of h4; MPEG channel mode is normally the top two bits - confirm
+            if (chans < 3) {
+                // Stereo, joint stereo, dual channel
+                channels = 2;
+            } else {
+                channels = 1;
+            }
+        } else {
+            throw new IllegalArgumentException("Magic Audio Frame Header not found");
+        }
+    }
+
+}

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java?rev=903334&r1=903333&r2=903334&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java Tue Jan 26 17:36:35 2010
@@ -43,25 +43,33 @@
 
     public ID3v1Handler(InputStream stream, ContentHandler handler)
             throws IOException, SAXException, TikaException {
-        byte[] tag = getSuffix(stream, 128);
-        if (tag.length == 128
-                && tag[0] == 'T' && tag[1] == 'A' && tag[2] == 'G') {
+        this(getSuffix(stream, 128));
+    }
+
+    /**
+     * Creates from the last 128 bytes of a stream.
+     * @param tagData Must be the last 128 bytes 
+     */
+    protected ID3v1Handler(byte[] tagData)
+            throws IOException, SAXException, TikaException {
+        if (tagData.length == 128
+                && tagData[0] == 'T' && tagData[1] == 'A' && tagData[2] == 'G') {
             found = true;
 
-            title = getString(tag, 3, 33);
-            artist = getString(tag, 33, 63);
-            album = getString(tag, 63, 93);
-            year = getString(tag, 93, 97);
-            comment = getString(tag, 97, 127);
+            title = getString(tagData, 3, 33);
+            artist = getString(tagData, 33, 63);
+            album = getString(tagData, 63, 93);
+            year = getString(tagData, 93, 97);
+            comment = getString(tagData, 97, 127);
 
-            int genreID = (int) tag[127] & 0xff; // unsigned byte
+            int genreID = (int) tagData[127] & 0xff; // unsigned byte
             genre = GENRES[Math.min(genreID, GENRES.length - 1)];
 
             // ID3v1.1 Track addition
             // If the last two bytes of the comment field are zero and
             // non-zero, then the last byte is the track number
-            if (tag[125] == 0 && tag[126] != 0) {
-                int trackNum = (int) tag[126] & 0xff;
+            if (tagData[125] == 0 && tagData[126] != 0) {
+                int trackNum = (int) tagData[126] & 0xff;
                 trackNumber = Integer.toString(trackNum);
             }
         }
@@ -100,7 +108,6 @@
         return trackNumber;
     }
 
-
     /**
      * Returns the identified ISO-8859-1 substring from the given byte buffer.
      * The return value is the zero-terminated substring retrieved from

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java?rev=903334&r1=903333&r2=903334&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java Tue Jan 26 17:36:35 2010
@@ -25,7 +25,7 @@
  * A frame of ID3v2 data, which is then passed to a handler to 
  * be turned into useful data.
  */
-public class ID3v2Frame {
+public class ID3v2Frame implements MP3Frame {
     private int majorVersion;
     private int minorVersion;
     private int flags;
@@ -59,15 +59,19 @@
     }
 
     /**
-     * Returns a frame of ID3v2 data, or null if the
-     *  next data to be read from the InputStream 
-     *  doesn't correspond to an ID3v2 Frame
+     * Returns the next Frame (ID3v2 or Audio) in
+     *  the file, or null if the next batch of data
+     *  doesn't correspond to either an ID3v2 Frame
+     *  or an Audio Frame.
+     * ID3v2 Frames should come before all Audio ones.
      */
-    public static ID3v2Frame createFrameIfPresent(InputStream inp)
+    public static MP3Frame createFrameIfPresent(InputStream inp)
             throws IOException {
         int h1 = inp.read();
         int h2 = inp.read();
         int h3 = inp.read();
+        
+        // Is it an ID3v2 Frame? 
         if (h1 == (int)'I' && h2 == (int)'D' && h3 == (int)'3') {
             int majorVersion = inp.read();
             int minorVersion = inp.read();
@@ -76,6 +80,12 @@
             }
             return new ID3v2Frame(majorVersion, minorVersion, inp);
         }
+        
+        // Is it an Audio Frame?
+        int h4 = inp.read();
+        if (AudioFrame.isAudioHeader(h1, h2, h3, h4)) {
+            return new AudioFrame(h1, h2, h3, h4, inp);
+        }
 
         // Not a frame header
         return null;
@@ -88,7 +98,7 @@
 
         // Get the flags and the length
         flags = inp.read();
-        length = 4 * getInt(readFully(inp, 4));
+        length = get7BitsInt(readFully(inp, 4), 0);
 
         // Do we have an extended header?
         if ((flags & 0x02) == 0x02) {
@@ -125,6 +135,19 @@
         return (b0 << 8) + (b1 << 0);
     }
 
+    /**
+     * Decodes a Synchsafe integer from 4 bytes of data,
+     *  starting at offset. Only the low 7 bits of each byte
+     *  are used, first byte most significant (28 bits in all).
+     */
+    protected static int get7BitsInt(byte[] data, int offset) {
+        int b0 = data[offset+0] & 0x7F;
+        int b1 = data[offset+1] & 0x7F;
+        int b2 = data[offset+2] & 0x7F;
+        int b3 = data[offset+3] & 0x7F;
+        return (b0 << 21) + (b1 << 14) + (b2 << 7) + (b3 << 0);
+    }
+
     protected static byte[] readFully(InputStream inp, int length)
             throws IOException {
         byte[] b = new byte[length];

Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java?rev=903334&view=auto
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java (added)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java Tue Jan 26 17:36:35 2010
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+
+/**
+ * Marker interface for a frame in an MP3 file. It declares no
+ *  methods; ID3v2Frame and AudioFrame implement it.
+ */
+public interface MP3Frame {
+}

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java?rev=903334&r1=903333&r2=903334&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mp3/Mp3Parser.java Tue Jan 26 17:36:35 2010
@@ -47,19 +47,19 @@
 
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         xhtml.startDocument();
-        
+
         // Create handlers for the various kinds of ID3 tags
-        ID3Tags[] tags = getAllTagHandlers(stream, handler);
+        ID3TagsAndAudio audioAndTags = getAllTagHandlers(stream, handler);
+
+        if (audioAndTags.tags.length > 0) {
+           CompositeTagHandler tag = new CompositeTagHandler(audioAndTags.tags);
 
-        if (tags.length > 0) {
-           CompositeTagHandler tag = new CompositeTagHandler(tags);
-           
            metadata.set(Metadata.TITLE, tag.getTitle());
            metadata.set(Metadata.AUTHOR, tag.getArtist());
 
            xhtml.element("h1", tag.getTitle());
            xhtml.element("p", tag.getArtist());
-            
+
             // ID3v1.1 Track addition
             if (tag.getTrackNumber() != null) {
                 xhtml.element("p", tag.getAlbum() + ", track " + tag.getTrackNumber());
@@ -70,6 +70,11 @@
             xhtml.element("p", tag.getComment());
             xhtml.element("p", tag.getGenre());
         }
+        if (audioAndTags.audio != null) {
+            metadata.set("samplerate", String.valueOf(audioAndTags.audio.getSampleRate()));
+            metadata.set("channels", String.valueOf(audioAndTags.audio.getChannels()));
+            metadata.set("version", audioAndTags.audio.getVersion());
+        }
 
         xhtml.endDocument();
     }
@@ -87,29 +92,35 @@
      * Scans the MP3 frames for ID3 tags, and creates ID3Tag Handlers
      *  for each supported set of tags. 
      */
-    protected ID3Tags[] getAllTagHandlers(InputStream stream, ContentHandler handler)
+    protected static ID3TagsAndAudio getAllTagHandlers(InputStream stream, ContentHandler handler)
            throws IOException, SAXException, TikaException {
        ID3v24Handler v24 = null;
        ID3v23Handler v23 = null;
        ID3v22Handler v22 = null;
        ID3v1Handler v1 = null;
+       AudioFrame firstAudio = null;
 
        // ID3v2 tags live at the start of the file
        // You can apparently have several different ID3 tag blocks
        // So, keep going until we don't find any more
-       ID3v2Frame f;
-       while ((f = ID3v2Frame.createFrameIfPresent(stream)) != null) {
-          if (f.getMajorVersion() == 4) {
-             v24 = new ID3v24Handler(f);
-          } else if(f.getMajorVersion() == 3) {
-             v23 = new ID3v23Handler(f);
-          } else if(f.getMajorVersion() == 2) {
-             v22 = new ID3v22Handler(f);
-          }
+       MP3Frame f;
+       while ((f = ID3v2Frame.createFrameIfPresent(stream)) != null && firstAudio == null) {
+           if(f instanceof ID3v2Frame) {
+               ID3v2Frame id3F = (ID3v2Frame)f;
+               if (id3F.getMajorVersion() == 4) {
+                   v24 = new ID3v24Handler(id3F);
+               } else if(id3F.getMajorVersion() == 3) {
+                   v23 = new ID3v23Handler(id3F);
+               } else if(id3F.getMajorVersion() == 2) {
+                   v22 = new ID3v22Handler(id3F);
+               }
+           } else if(f instanceof AudioFrame) {
+               firstAudio = (AudioFrame)f;
+           }
        }
 
        // ID3v1 tags live at the end of the file
-       // Just let the handler run until it's finished
+       // Our handler handily seeks to the end for us
        v1 = new ID3v1Handler(stream, handler);
 
        // Go in order of preference
@@ -128,7 +139,16 @@
        if(v1 != null && v1.getTagsPresent()) {
           tags.add(v1);
        }
-       return tags.toArray(new ID3Tags[tags.size()]);
+       
+       ID3TagsAndAudio ret = new ID3TagsAndAudio();
+       ret.audio = firstAudio;
+       ret.tags = tags.toArray(new ID3Tags[tags.size()]);
+       return ret;
+    }
+
+    protected static class ID3TagsAndAudio { // result holder filled in by getAllTagHandlers
+        private ID3Tags[] tags; // ID3 tag handlers collected by getAllTagHandlers
+        private AudioFrame audio; // first audio frame seen, or null when none was found
     }
 
 }

Modified: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java?rev=903334&r1=903333&r2=903334&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java (original)
+++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mp3/Mp3ParserTest.java Tue Jan 26 17:36:35 2010
@@ -59,6 +59,10 @@
         assertTrue(content.contains("2008"));
         assertTrue(content.contains("Test Comment"));
         assertTrue(content.contains("Rock"));
+        
+        assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
+        assertEquals("44100", metadata.get("samplerate"));
+        assertEquals("2", metadata.get("channels"));
     }
 
     /**
@@ -89,6 +93,10 @@
         assertTrue(content.contains("2008"));
         assertTrue(content.contains("Test Comment"));
         assertTrue(content.contains("Rock"));
+        
+        assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
+        assertEquals("44100", metadata.get("samplerate"));
+        assertEquals("2", metadata.get("channels"));
     }
 
     /**
@@ -119,6 +127,10 @@
         assertTrue(content.contains("2008"));
         assertTrue(content.contains("Test Comment"));
         assertTrue(content.contains("Rock"));
+        
+        assertEquals("MPEG 3 Layer III Version 1", metadata.get("version"));
+        assertEquals("44100", metadata.get("samplerate"));
+        assertEquals("2", metadata.get("channels"));
     }
 
     public void testID3v2Frame() throws Exception {
@@ -130,7 +142,8 @@
        assertEquals(11, ID3v2Frame.getInt(new byte[] {0,0,0,0x0b}));
        assertEquals(257, ID3v2Frame.getInt(new byte[] {0,0,1,1}));
        
-       ID3v2Frame f = ID3v2Frame.createFrameIfPresent(new ByteArrayInputStream(empty));
+       ID3v2Frame f = (ID3v2Frame)
+            ID3v2Frame.createFrameIfPresent(new ByteArrayInputStream(empty));
        assertEquals(3, f.getMajorVersion());
        assertEquals(1, f.getMinorVersion());
        assertEquals(0, f.getFlags());