You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2015/12/29 00:22:47 UTC

svn commit: r1722029 [2/4] - in /tika/branches/2.x: tika-parser-modules/ tika-parser-modules/tika-multimedia-module/ tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/module/ tika-parser-modules/tika-multimedia-module/src/main/ja...

Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/TiffParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/TiffParser.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/TiffParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/TiffParser.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.image.xmp.JempboxExtractor;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser registered for the media type built by {@code MediaType.image("tiff")}.
+ * Metadata is collected through {@link ImageMetadataExtractor} and
+ * {@link JempboxExtractor}; the XHTML output is an empty document.
+ */
+public class TiffParser extends AbstractParser {
+
+    /** Serial version UID. */
+    private static final long serialVersionUID = -3941143576535464926L;
+
+    /** The single media type this parser declares support for. */
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MediaType.image("tiff"));
+
+    /**
+     * Returns the media types handled by this parser.
+     *
+     * @param context the parse context (not consulted)
+     * @return the singleton set of supported types
+     */
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Fills {@code metadata} from the image, then emits an empty XHTML
+     * document (start immediately followed by end) to {@code handler}.
+     *
+     * @param stream   the image data
+     * @param handler  receiver of the XHTML SAX events
+     * @param metadata the metadata object to populate
+     * @param context  the parse context (not consulted)
+     * @throws IOException   if reading the stream fails
+     * @throws SAXException  if the handler rejects an event
+     * @throws TikaException if metadata extraction or cleanup fails
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        extractMetadata(stream, metadata);
+
+        XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
+        document.startDocument();
+        document.endDocument();
+    }
+
+    /**
+     * Runs both metadata extractors over the stream. The temporary resources
+     * created while wrapping the stream are disposed even when an extractor
+     * throws.
+     */
+    private void extractMetadata(InputStream stream, Metadata metadata)
+            throws IOException, SAXException, TikaException {
+        TemporaryResources resources = new TemporaryResources();
+        try {
+            TikaInputStream tikaStream = TikaInputStream.get(stream, resources);
+            // The image extractor works on a file; the XMP extractor then
+            // scans the stream itself.
+            new ImageMetadataExtractor(metadata).parseTiff(tikaStream.getFile());
+            new JempboxExtractor(metadata).parse(tikaStream);
+        } finally {
+            resources.dispose();
+        }
+    }
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/WebPParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/WebPParser.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/WebPParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/WebPParser.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+
+/**
+ * Parser registered for the media type built by {@code MediaType.image("webp")}.
+ * Metadata is collected through {@link ImageMetadataExtractor}; the XHTML
+ * output is an empty document.
+ */
+public class WebPParser extends AbstractParser {
+
+    /** Serial version UID. */
+    private static final long serialVersionUID = -3941143576535464926L;
+
+    /** The single media type this parser declares support for. */
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MediaType.image("webp"));
+
+    /**
+     * Returns the media types handled by this parser.
+     *
+     * @param context the parse context (not consulted)
+     * @return the singleton set of supported types
+     */
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Fills {@code metadata} from the image, then emits an empty XHTML
+     * document (start immediately followed by end) to {@code handler}.
+     *
+     * @param stream   the image data
+     * @param handler  receiver of the XHTML SAX events
+     * @param metadata the metadata object to populate
+     * @param context  the parse context (not consulted)
+     * @throws IOException   if reading the stream fails
+     * @throws SAXException  if the handler rejects an event
+     * @throws TikaException if metadata extraction or cleanup fails
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        extractMetadata(stream, metadata);
+
+        XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
+        document.startDocument();
+        document.endDocument();
+    }
+
+    /**
+     * Runs the image metadata extractor over the stream. The temporary
+     * resources created while wrapping the stream are disposed even when the
+     * extractor throws.
+     */
+    private void extractMetadata(InputStream stream, Metadata metadata)
+            throws IOException, SAXException, TikaException {
+        TemporaryResources resources = new TemporaryResources();
+        try {
+            TikaInputStream tikaStream = TikaInputStream.get(stream, resources);
+            new ImageMetadataExtractor(metadata).parseWebP(tikaStream.getFile());
+        } finally {
+            resources.dispose();
+        }
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image.xmp;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.List;
+
+import org.apache.jempbox.xmp.XMPMetadata;
+import org.apache.jempbox.xmp.XMPSchemaDublinCore;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.xml.sax.InputSource;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+public class JempboxExtractor {
+
+    // The XMP spec says it must be unicode, but for most file formats it specifies "must be encoded in UTF-8"
+    private static final String DEFAULT_XMP_CHARSET = UTF_8.name();
+    private XMPPacketScanner scanner = new XMPPacketScanner();
+    private Metadata metadata;
+
+    public JempboxExtractor(Metadata metadata) {
+        this.metadata = metadata;
+    }
+
+    public void parse(InputStream file) throws IOException, TikaException {
+        ByteArrayOutputStream xmpraw = new ByteArrayOutputStream();
+        if (!scanner.parse(file, xmpraw)) {
+            return;
+        }
+
+        Reader decoded = new InputStreamReader(
+                new ByteArrayInputStream(xmpraw.toByteArray()),
+                DEFAULT_XMP_CHARSET);
+        try {
+            XMPMetadata xmp = XMPMetadata.load(new InputSource(decoded));
+            XMPSchemaDublinCore dc = xmp.getDublinCoreSchema();
+            if (dc != null) {
+                if (dc.getTitle() != null) {
+                    metadata.set(TikaCoreProperties.TITLE, dc.getTitle());
+                }
+                if (dc.getDescription() != null) {
+                    metadata.set(TikaCoreProperties.DESCRIPTION, dc.getDescription());
+                }
+                if (dc.getCreators() != null && dc.getCreators().size() > 0) {
+                    metadata.set(TikaCoreProperties.CREATOR, joinCreators(dc.getCreators()));
+                }
+                if (dc.getSubjects() != null && dc.getSubjects().size() > 0) {
+                    for (String keyword : dc.getSubjects()) {
+                        metadata.add(TikaCoreProperties.KEYWORDS, keyword);
+                    }
+                    // TODO should we set KEYWORDS too?
+                    // All tested photo managers set the same in Iptc.Application2.Keywords and Xmp.dc.subject
+                }
+            }
+        } catch (IOException e) {
+            // Could not parse embedded XMP metadata. That's not a serious
+            // problem, so we'll just ignore the issue for now.
+            // TODO: Make error handling like this configurable.
+        }
+    }
+
+    protected String joinCreators(List<String> creators) {
+        if (creators == null || creators.size() == 0) {
+            return "";
+        }
+        if (creators.size() == 1) {
+            return creators.get(0);
+        }
+        StringBuffer c = new StringBuffer();
+        for (String s : creators) {
+            c.append(", ").append(s);
+        }
+        return c.substring(2);
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/xmp/XMPPacketScanner.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/xmp/XMPPacketScanner.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/xmp/XMPPacketScanner.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/xmp/XMPPacketScanner.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* $Id: XMPPacketParser.java 750418 2009-03-05 11:03:54Z vhennebert $ */
+
+package org.apache.tika.parser.image.xmp;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+
+import static java.nio.charset.StandardCharsets.US_ASCII;
+
+/**
+ * This class is a scanner for XMP packets. It locates the first XMP packet header in a
+ * stream and copies the bytes that follow it, up to the next {@code <?xpacket} marker,
+ * to an output stream.
+ * <p/>
+ * Important: Before you use this class to look for an XMP packet in some random file, please read
+ * the chapter on "Scanning Files for XMP Packets" in the XMP specification!
+ * <p/>
+ * This class was branched from http://xmlgraphics.apache.org/ XMPPacketParser.
+ * See also org.semanticdesktop.aperture.extractor.xmp.XMPExtractor, a variant.
+ */
+public class XMPPacketScanner {
+
+    /** Start of the processing instruction that opens a packet. */
+    private static final byte[] PACKET_HEADER = "<?xpacket begin=".getBytes(US_ASCII);
+    /** End of a processing instruction. */
+    private static final byte[] PACKET_HEADER_END = "?>".getBytes(US_ASCII);
+    /** Start of the processing instruction that closes a packet. */
+    private static final byte[] PACKET_TRAILER = "<?xpacket".getBytes(US_ASCII);
+
+    /**
+     * Consumes the stream up to and including the first occurrence of {@code match},
+     * discarding everything read.
+     *
+     * @return true if {@code match} was found, false if the stream ended first
+     */
+    private static boolean skipAfter(InputStream in, byte[] match) throws IOException {
+        return skipAfter(in, match, null);
+    }
+
+    /**
+     * Consumes the stream up to and including the first occurrence of {@code match}.
+     * When {@code out} is not null, every byte read before that occurrence is copied to it;
+     * the bytes of the occurrence itself are not. If the stream ends first, bytes of a
+     * pending partial match are not written.
+     * <p/>
+     * A byte that breaks a partial match may itself begin a new one (for example the
+     * second {@code ?} in {@code ??>}), so the scan falls back to the longest prefix of
+     * {@code match} that is still a suffix of the bytes seen instead of restarting at zero.
+     *
+     * @param in    the stream to read, one byte at a time
+     * @param match the byte sequence to look for
+     * @param out   receiver for the bytes preceding the match, or null to discard them
+     * @return true if {@code match} was found, false if the stream ended first
+     * @throws IOException if reading or writing fails
+     */
+    private static boolean skipAfter(InputStream in, byte[] match, OutputStream out)
+            throws IOException {
+        int len = match.length;
+        int[] fallback = buildFallbackTable(match);
+        int found = 0;
+        int b;
+        while ((b = in.read()) >= 0) {
+            // Shrink the partial match until the new byte extends it or nothing is left.
+            while (found > 0 && b != (match[found] & 0xff)) {
+                int shorter = fallback[found - 1];
+                if (out != null) {
+                    // These leading bytes can no longer be part of a match.
+                    out.write(match, 0, found - shorter);
+                }
+                found = shorter;
+            }
+            if (b == (match[found] & 0xff)) {
+                found++;
+                if (found == len) {
+                    return true;
+                }
+            } else if (out != null) {
+                out.write(b);
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Builds the prefix table used by {@link #skipAfter(InputStream, byte[], OutputStream)}:
+     * entry {@code i} is the length of the longest proper prefix of {@code match[0..i]}
+     * that is also a suffix of it.
+     */
+    private static int[] buildFallbackTable(byte[] match) {
+        int[] table = new int[match.length];
+        int k = 0;
+        for (int i = 1; i < match.length; i++) {
+            while (k > 0 && match[i] != match[k]) {
+                k = table[k - 1];
+            }
+            if (match[i] == match[k]) {
+                k++;
+            }
+            table[i] = k;
+        }
+        return table;
+    }
+
+    /**
+     * Locates an XMP packet in a stream and copies its content, i.e. the bytes between
+     * the end of the packet header and the start of the next {@code <?xpacket} marker,
+     * to {@code xmlOut}. Note: This method only finds the first XMP packet in a stream.
+     * And it cannot determine whether it has found the right XMP packet if there are
+     * multiple packets.
+     * <p/>
+     * Does <em>not</em> close the stream. Reading stops right after the {@code <?xpacket}
+     * marker that ends the packet content. If the supplied stream does not support
+     * mark/reset it is wrapped in a {@link java.io.BufferedInputStream}, whose buffer may
+     * already have consumed bytes beyond that point.
+     *
+     * @param in     the InputStream to search
+     * @param xmlOut to write the XMP packet to
+     * @return true if XMP packet is found, false otherwise
+     * @throws IOException if an I/O error occurs, if the packet header is not closed by
+     *                     {@code ?>}, or if no trailing {@code <?xpacket} marker follows
+     */
+    public boolean parse(InputStream in, OutputStream xmlOut) throws IOException {
+        if (!in.markSupported()) {
+            in = new java.io.BufferedInputStream(in);
+        }
+        if (!skipAfter(in, PACKET_HEADER)) {
+            return false;
+        }
+        //TODO Inspect "begin" attribute!
+        if (!skipAfter(in, PACKET_HEADER_END)) {
+            throw new IOException("Invalid XMP packet header!");
+        }
+        //TODO Do with TeeInputStream when Commons IO 1.4 is available
+        if (!skipAfter(in, PACKET_TRAILER, xmlOut)) {
+            throw new IOException("XMP packet not properly terminated!");
+        }
+        return true;
+    }
+
+}
+

Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.jpeg;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.image.ImageMetadataExtractor;
+import org.apache.tika.parser.image.xmp.JempboxExtractor;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser registered for the media type built by {@code MediaType.image("jpeg")}.
+ * Metadata is collected through {@link ImageMetadataExtractor} and
+ * {@link JempboxExtractor}; the XHTML output is an empty document.
+ */
+public class JpegParser extends AbstractParser {
+
+    /** Serial version UID. */
+    private static final long serialVersionUID = -1355028253756234603L;
+
+    /** The single media type this parser declares support for. */
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MediaType.image("jpeg"));
+
+    /**
+     * Returns the media types handled by this parser.
+     *
+     * @param context the parse context (not consulted)
+     * @return the singleton set of supported types
+     */
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Fills {@code metadata} from the image, then emits an empty XHTML
+     * document (start immediately followed by end) to {@code handler}.
+     *
+     * @param stream   the image data
+     * @param handler  receiver of the XHTML SAX events
+     * @param metadata the metadata object to populate
+     * @param context  the parse context (not consulted)
+     * @throws IOException   if reading the stream fails
+     * @throws SAXException  if the handler rejects an event
+     * @throws TikaException if metadata extraction or cleanup fails
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        extractMetadata(stream, metadata);
+
+        XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
+        document.startDocument();
+        document.endDocument();
+    }
+
+    /**
+     * Runs both metadata extractors over the stream. The temporary resources
+     * created while wrapping the stream are disposed even when an extractor
+     * throws.
+     */
+    private void extractMetadata(InputStream stream, Metadata metadata)
+            throws IOException, SAXException, TikaException {
+        TemporaryResources resources = new TemporaryResources();
+        try {
+            TikaInputStream tikaStream = TikaInputStream.get(stream, resources);
+            // The image extractor works on a file; the XMP extractor then
+            // scans the stream itself.
+            new ImageMetadataExtractor(metadata).parseJpeg(tikaStream.getFile());
+            new JempboxExtractor(metadata).parse(tikaStream);
+        } finally {
+            resources.dispose();
+        }
+    }
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,252 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * An Audio Frame in an MP3 file. These come after the ID3v2 tags in the file.
+ * Currently, only the header is processed, not the raw audio data.
+ */
+public class AudioFrame implements MP3Frame {
+    /** Constant for the MPEG version 1. */
+    public static final int MPEG_V1 = 3;
+
+    /** Constant for the MPEG version 2. */
+    public static final int MPEG_V2 = 2;
+
+    /** Constant for the MPEG version 2.5. */
+    public static final int MPEG_V2_5 = 0;
+
+    /** Constant for audio layer 1. */
+    public static final int LAYER_1 = 3;
+
+    /** Constant for audio layer 2. */
+    public static final int LAYER_2 = 2;
+
+    /** Constant for audio layer 3. */
+    public static final int LAYER_3 = 1;
+
+    private final String version;
+    private final int versionCode;
+    private final int layer;
+    private final int sampleRate;
+    private final int channels;
+    private final int bitRate;
+    private final int length;
+    private final float duration;
+
+    /**
+     * Get the human readable version string, as built from the version and
+     * layer codes.
+     */
+    public String getVersion() {
+        return version;
+    }
+
+    /**
+     * Get the sampling rate, in Hz
+     */
+    public int getSampleRate() {
+        return sampleRate;
+    }
+
+    /**
+     * Get the number of channels (1=mono, 2=stereo)
+     */
+    public int getChannels() {
+        return channels;
+    }
+
+    /**
+     * Get the version code.
+     * @return the version code (one of the {@code MPEG} constants)
+     */
+    public int getVersionCode() {
+        return versionCode;
+    }
+
+    /**
+     * Get the audio layer code.
+     * @return the audio layer (one of the {@code LAYER} constants)
+     */
+    public int getLayer() {
+        return layer;
+    }
+
+    /**
+     * Get the bit rate in bit per second.
+     * @return the bit rate (0 when the frame was built by one of the
+     *         deprecated header-reading constructors)
+     */
+    public int getBitRate() {
+        return bitRate;
+    }
+
+    /**
+     * Returns the frame length in bytes.
+     * @return the frame length (0 when the frame was built by one of the
+     *         deprecated header-reading constructors)
+     */
+    public int getLength() {
+        return length;
+    }
+
+    /**
+     * Returns the duration in milliseconds.
+     * @return the duration (0 when the frame was built by one of the
+     *         deprecated header-reading constructors)
+     */
+    public float getDuration() {
+        return duration;
+    }
+
+    /**
+     * Does this appear to be a 4 byte audio frame header?
+     *
+     * @param h1 first header byte as returned by {@link InputStream#read()}
+     * @param h2 second header byte
+     * @param h3 third header byte
+     * @param h4 fourth header byte
+     * @return true if none of the values is -1 (end of stream) and the header
+     *         starts with the 11 set sync bits
+     */
+    public static boolean isAudioHeader(int h1, int h2, int h3, int h4) {
+        if (h1 == -1 || h2 == -1 || h3 == -1 || h4 == -1) {
+            return false;
+        }
+        // Check for the magic 11 bits set at the start: all 8 bits of the
+        // first byte plus the top 3 bits (mask 0xE0) of the second.
+        // Note - doesn't do a CRC check
+        return h1 == 0xff && (h2 & 0xE0) == 0xE0;
+    }
+
+    /**
+     * Reads a four byte header from the stream and decodes it.
+     *
+     * @param stream  the stream positioned at the frame header
+     * @param handler not used
+     * @throws IOException if reading the header fails
+     * @throws IllegalArgumentException if the bytes read are not an audio frame header
+     * @deprecated Use the constructor which is passed all values directly.
+     */
+    @Deprecated
+    public AudioFrame(InputStream stream, ContentHandler handler)
+            throws IOException, SAXException, TikaException {
+        this(-2, -2, -2, -2, stream);
+    }
+
+    /**
+     * Decodes version, layer, sample rate and channel count from a four byte
+     * frame header. Bit rate, length and duration are set to 0.
+     *
+     * @param h1 first header byte, or -2
+     * @param h2 second header byte, or -2
+     * @param h3 third header byte, or -2
+     * @param h4 fourth header byte, or -2
+     * @param in stream the header is read from when all four values are -2;
+     *           not touched otherwise
+     * @throws IOException if reading the header fails
+     * @throws IllegalArgumentException if the four bytes are not an audio frame header
+     * @deprecated Use the constructor which is passed all values directly.
+     */
+    @Deprecated
+    public AudioFrame(int h1, int h2, int h3, int h4, InputStream in)
+            throws IOException {
+        if (h1 == -2 && h2 == -2 && h3 == -2 && h4 == -2) {
+            h1 = in.read();
+            h2 = in.read();
+            h3 = in.read();
+            h4 = in.read();
+        }
+
+        if (!isAudioHeader(h1, h2, h3, h4)) {
+            throw new IllegalArgumentException("Magic Audio Frame Header not found");
+        }
+
+        layer = (h2 >> 1) & 0x03;
+        versionCode = (h2 >> 3) & 0x03;
+        version = generateVersionStr(versionCode, layer);
+
+        // The rate code selects a base rate that is scaled by MPEG version:
+        // x1 for version 2.5, x2 for version 2, x4 for version 1.
+        int rateCode = (h3 >> 2) & 0x03;
+        int rate;
+        switch (rateCode) {
+        case 0:
+            rate = 11025;
+            break;
+        case 1:
+            rate = 12000;
+            break;
+        default:
+            rate = 8000;
+        }
+        if (versionCode == MPEG_V2) {
+            rate *= 2;
+        } else if (versionCode == MPEG_V1) {
+            rate *= 4;
+        }
+        sampleRate = rate;
+
+        // The channel mode is the top two bits of the fourth header byte:
+        // 0 = stereo, 1 = joint stereo, 2 = dual channel, 3 = single channel.
+        int channelMode = (h4 >> 6) & 0x03;
+        if (channelMode < 3) {
+            // Stereo, joint stereo, dual channel
+            channels = 2;
+        } else {
+            channels = 1;
+        }
+        bitRate = 0;
+        duration = 0;
+        length = 0;
+    }
+
+    /**
+     *
+     * Creates a new instance of {@code AudioFrame} and initializes all properties.
+     * @param mpegVersion the code for the MPEG version
+     * @param layer the code for the layer
+     * @param bitRate the bit rate (in bps)
+     * @param sampleRate the sample rate (in samples per second)
+     * @param channels the number of channels
+     * @param length the frame length (in bytes)
+     * @param duration the duration of this frame (in milliseconds)
+     */
+    public AudioFrame(int mpegVersion, int layer, int bitRate, int sampleRate,
+            int channels, int length, float duration) {
+        versionCode = mpegVersion;
+        this.layer = layer;
+        this.bitRate = bitRate;
+        this.sampleRate = sampleRate;
+        this.channels = channels;
+        this.length = length;
+        this.duration = duration;
+        version = generateVersionStr(mpegVersion, layer);
+    }
+
+    /**
+     * Generates a string for the version of this audio frame.
+     * @param version the code for the MPEG version
+     * @param layer the code for the layer
+     * @return a string for the version
+     */
+    private static String generateVersionStr(int version, int layer) {
+        StringBuilder buf = new StringBuilder(64);
+        buf.append("MPEG 3 Layer ");
+        if (layer == LAYER_3) {
+            buf.append("III");
+        } else if (layer == LAYER_2) {
+            buf.append("II");
+        } else if (layer == LAYER_1) {
+            buf.append("I");
+        } else {
+            buf.append("(reserved)");
+        }
+
+        buf.append(" Version ");
+        if (version == MPEG_V2_5) {
+            buf.append("2.5");
+        } else if (version == MPEG_V2) {
+            buf.append("2");
+        } else if (version == MPEG_V1) {
+            buf.append("1");
+        } else {
+            buf.append("(reserved)");
+        }
+
+        return buf.toString();
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * Takes an array of {@link ID3Tags} in preference order, and when asked for
+ * a given tag, will return it from the first {@link ID3Tags} that has it.
+ */
+public class CompositeTagHandler implements ID3Tags {
+
+    private ID3Tags[] tags;
+
+    public CompositeTagHandler(ID3Tags[] tags) {
+        this.tags = tags;
+    }
+
+    public boolean getTagsPresent() {
+        for (ID3Tags tag : tags) {
+            if (tag.getTagsPresent()) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    public String getTitle() {
+        for (ID3Tags tag : tags) {
+            if (tag.getTitle() != null) {
+                return tag.getTitle();
+            }
+        }
+        return null;
+    }
+
+    public String getArtist() {
+        for (ID3Tags tag : tags) {
+            if (tag.getArtist() != null) {
+                return tag.getArtist();
+            }
+        }
+        return null;
+    }
+
+    public String getAlbum() {
+        for (ID3Tags tag : tags) {
+            if (tag.getAlbum() != null) {
+                return tag.getAlbum();
+            }
+        }
+        return null;
+    }
+
+    public String getComposer() {
+        for (ID3Tags tag : tags) {
+            if (tag.getComposer() != null) {
+                return tag.getComposer();
+            }
+        }
+        return null;
+    }
+
+    public String getYear() {
+        for (ID3Tags tag : tags) {
+            if (tag.getYear() != null) {
+                return tag.getYear();
+            }
+        }
+        return null;
+    }
+
+    public List<ID3Comment> getComments() {
+        for (ID3Tags tag : tags) {
+            List<ID3Comment> comments = tag.getComments();
+            if (comments != null && comments.size() > 0) {
+                return comments;
+            }
+        }
+        return Collections.emptyList();
+    }
+
+    public String getGenre() {
+        for (ID3Tags tag : tags) {
+            if (tag.getGenre() != null) {
+                return tag.getGenre();
+            }
+        }
+        return null;
+    }
+
+    public String getTrackNumber() {
+        for (ID3Tags tag : tags) {
+            if (tag.getTrackNumber() != null) {
+                return tag.getTrackNumber();
+            }
+        }
+        return null;
+    }
+
+    public String getAlbumArtist() {
+        for (ID3Tags tag : tags) {
+            if (tag.getAlbumArtist() != null) {
+                return tag.getAlbumArtist();
+            }
+        }
+        return null;
+    }
+
+    public String getDisc() {
+        for (ID3Tags tag : tags) {
+            if (tag.getDisc() != null) {
+                return tag.getDisc();
+            }
+        }
+        return null;
+    }
+
+    public String getCompilation() {
+        for (ID3Tags tag : tags) {
+            if (tag.getCompilation() != null) {
+                return tag.getCompilation();
+            }
+        }
+        return null;
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,254 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.util.List;
+
+/**
+ * Interface that defines the common interface for ID3 tag parsers,
+ *  such as ID3v1 and ID3v2.3.
+ * Implementations should return NULL if the file lacks a given
+ *  tag, or if the tag isn't defined for the version.
+ *
+ * Note that so far, only the ID3v1 core tags are listed here. In
+ *  future, we may wish to add more to cover the extra tags that
+ *  our ID3v2 handlers can produce.
+ */
+public interface ID3Tags {
+    /**
+     * List of predefined genres, where the array index is the numeric
+     *  genre ID. The final, empty entry is a sentinel rather than a
+     *  real genre.
+     *
+     * @see <a href="http://www.id3.org/id3v2-00">ID3v2.00 specification</a>
+     */
+    String[] GENRES = new String[] {
+        /*  0 */ "Blues",
+        /*  1 */ "Classic Rock",
+        /*  2 */ "Country",
+        /*  3 */ "Dance",
+        /*  4 */ "Disco",
+        /*  5 */ "Funk",
+        /*  6 */ "Grunge",
+        /*  7 */ "Hip-Hop",
+        /*  8 */ "Jazz",
+        /*  9 */ "Metal",
+        /* 10 */ "New Age",
+        /* 11 */ "Oldies",
+        /* 12 */ "Other",
+        /* 13 */ "Pop",
+        /* 14 */ "R&B",
+        /* 15 */ "Rap",
+        /* 16 */ "Reggae",
+        /* 17 */ "Rock",
+        /* 18 */ "Techno",
+        /* 19 */ "Industrial",
+        /* 20 */ "Alternative",
+        /* 21 */ "Ska",
+        /* 22 */ "Death Metal",
+        /* 23 */ "Pranks",
+        /* 24 */ "Soundtrack",
+        /* 25 */ "Euro-Techno",
+        /* 26 */ "Ambient",
+        /* 27 */ "Trip-Hop",
+        /* 28 */ "Vocal",
+        /* 29 */ "Jazz+Funk",
+        /* 30 */ "Fusion",
+        /* 31 */ "Trance",
+        /* 32 */ "Classical",
+        /* 33 */ "Instrumental",
+        /* 34 */ "Acid",
+        /* 35 */ "House",
+        /* 36 */ "Game",
+        /* 37 */ "Sound Clip",
+        /* 38 */ "Gospel",
+        /* 39 */ "Noise",
+        /* 40 */ "AlternRock",
+        /* 41 */ "Bass",
+        /* 42 */ "Soul",
+        /* 43 */ "Punk",
+        /* 44 */ "Space",
+        /* 45 */ "Meditative",
+        /* 46 */ "Instrumental Pop",
+        /* 47 */ "Instrumental Rock",
+        /* 48 */ "Ethnic",
+        /* 49 */ "Gothic",
+        /* 50 */ "Darkwave",
+        /* 51 */ "Techno-Industrial",
+        /* 52 */ "Electronic",
+        /* 53 */ "Pop-Folk",
+        /* 54 */ "Eurodance",
+        /* 55 */ "Dream",
+        /* 56 */ "Southern Rock",
+        /* 57 */ "Comedy",
+        /* 58 */ "Cult",
+        /* 59 */ "Gangsta",
+        /* 60 */ "Top 40",
+        /* 61 */ "Christian Rap",
+        /* 62 */ "Pop/Funk",
+        /* 63 */ "Jungle",
+        /* 64 */ "Native American",
+        /* 65 */ "Cabaret",
+        /* 66 */ "New Wave",
+        /* 67 */ "Psychadelic",
+        /* 68 */ "Rave",
+        /* 69 */ "Showtunes",
+        /* 70 */ "Trailer",
+        /* 71 */ "Lo-Fi",
+        /* 72 */ "Tribal",
+        /* 73 */ "Acid Punk",
+        /* 74 */ "Acid Jazz",
+        /* 75 */ "Polka",
+        /* 76 */ "Retro",
+        /* 77 */ "Musical",
+        /* 78 */ "Rock & Roll",
+        /* 79 */ "Hard Rock",
+        /* 80 */ "Folk",
+        /* 81 */ "Folk-Rock",
+        /* 82 */ "National Folk",
+        /* 83 */ "Swing",
+        /* 84 */ "Fast Fusion",
+        /* 85 */ "Bebob",
+        /* 86 */ "Latin",
+        /* 87 */ "Revival",
+        /* 88 */ "Celtic",
+        /* 89 */ "Bluegrass",
+        /* 90 */ "Avantgarde",
+        /* 91 */ "Gothic Rock",
+        /* 92 */ "Progressive Rock",
+        /* 93 */ "Psychedelic Rock",
+        /* 94 */ "Symphonic Rock",
+        /* 95 */ "Slow Rock",
+        /* 96 */ "Big Band",
+        /* 97 */ "Chorus",
+        /* 98 */ "Easy Listening",
+        /* 99 */ "Acoustic",
+        /* 100 */ "Humour",
+        /* 101 */ "Speech",
+        /* 102 */ "Chanson",
+        /* 103 */ "Opera",
+        /* 104 */ "Chamber Music",
+        /* 105 */ "Sonata",
+        /* 106 */ "Symphony",
+        /* 107 */ "Booty Bass",
+        /* 108 */ "Primus",
+        /* 109 */ "Porn Groove",
+        /* 110 */ "Satire",
+        /* 111 */ "Slow Jam",
+        /* 112 */ "Club",
+        /* 113 */ "Tango",
+        /* 114 */ "Samba",
+        /* 115 */ "Folklore",
+        /* 116 */ "Ballad",
+        /* 117 */ "Power Ballad",
+        /* 118 */ "Rhythmic Soul",
+        /* 119 */ "Freestyle",
+        /* 120 */ "Duet",
+        /* 121 */ "Punk Rock",
+        /* 122 */ "Drum Solo",
+        /* 123 */ "A capella",
+        /* 124 */ "Euro-House",
+        /* 125 */ "Dance Hall",
+        /* sentinel */ ""
+    };
+
+    /**
+     * Does the file contain this kind of tags?
+     */
+    boolean getTagsPresent();
+
+    /**
+     * The title of the track
+     */
+    String getTitle();
+
+    /**
+     * The Artist for the track
+     */
+    String getArtist();
+
+    /**
+     * The Artist for the overall album / compilation of albums
+     */
+    String getAlbumArtist();
+
+    /**
+     * The album the track belongs to
+     */
+    String getAlbum();
+
+    /**
+     * The composer of the track
+     */
+    String getComposer();
+
+    /**
+     * The compilation value of the track, as stored in the tag
+     */
+    String getCompilation();
+
+    /**
+     * Retrieves the comments, if any.
+     * Files may have more than one comment, but normally only
+     *  one with any language/description pair.
+     */
+    List<ID3Comment> getComments();
+
+    /**
+     * The genre of the track
+     */
+    String getGenre();
+
+    /**
+     * The year of the track, as stored in the tag
+     */
+    String getYear();
+
+    /**
+     * The number of the track within the album / recording
+     */
+    String getTrackNumber();
+
+    /**
+     * The number of the disc this belongs to, within the set
+     */
+    String getDisc();
+
+    /**
+     * Represents a comment in ID3 (especially ID3 v2), which is
+     *  made up of several parts. Instances cannot be changed once
+     *  they have been created.
+     */
+    public static class ID3Comment {
+        private final String language;
+        private final String description;
+        private final String text;
+
+        /**
+         * Creates an ID3 v1 style comment tag, which only has text.
+         * The language and description are left as null.
+         */
+        public ID3Comment(String id3v1Text) {
+            this(null, null, id3v1Text);
+        }
+
+        /**
+         * Creates an ID3 v2 style comment tag
+         */
+        public ID3Comment(String language, String description, String text) {
+            this.language = language;
+            this.description = description;
+            this.text = text;
+        }
+
+        /**
+         * Gets the language, if present
+         *
+         * @return the language, or null if none was supplied
+         */
+        public String getLanguage() {
+            return language;
+        }
+
+        /**
+         * Gets the description, if present
+         *
+         * @return the description, or null if none was supplied
+         */
+        public String getDescription() {
+            return description;
+        }
+
+        /**
+         * Gets the text, if present
+         *
+         * @return the text, or null if none was supplied
+         */
+        public String getText() {
+            return text;
+        }
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,183 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.tika.exception.TikaException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+
+/**
+ * This is used to parse ID3 Version 1 Tag information from an MP3 file,
+ * if available.
+ *
+ * @see <a href="http://www.id3.org/ID3v1">MP3 ID3 Version 1 specification</a>
+ */
+public class ID3v1Handler implements ID3Tags {
+    private String title;
+    private String artist;
+    private String album;
+    private String year;
+    private ID3Comment comment;
+    private String genre;
+    private String trackNumber;
+
+    /** Set to true once a 128 byte block starting with "TAG" has been parsed */
+    boolean found = false;
+
+    /**
+     * Creates from a stream, using the 128 byte suffix that
+     * {@code LyricsHandler.getSuffix} returns for it.
+     *
+     * @param stream the stream to take the suffix from
+     * @param handler not used by this constructor
+     */
+    public ID3v1Handler(InputStream stream, ContentHandler handler)
+            throws IOException, SAXException, TikaException {
+        this(LyricsHandler.getSuffix(stream, 128));
+    }
+
+    /**
+     * Creates from the last 128 bytes of a stream.
+     * If the data is not exactly 128 bytes long, or does not start with
+     * "TAG", then nothing is parsed and {@link #getTagsPresent()} stays false.
+     *
+     * @param tagData Must be the last 128 bytes
+     */
+    protected ID3v1Handler(byte[] tagData)
+            throws IOException, SAXException, TikaException {
+        if (tagData.length == 128
+                && tagData[0] == 'T' && tagData[1] == 'A' && tagData[2] == 'G') {
+            found = true;
+
+            // Fixed width text fields, in the order they are stored
+            title = getString(tagData, 3, 33);
+            artist = getString(tagData, 33, 63);
+            album = getString(tagData, 63, 93);
+            year = getString(tagData, 93, 97);
+
+            String commentStr = getString(tagData, 97, 127);
+            comment = new ID3Comment(commentStr);
+
+            // Any ID past the end of the list maps onto the empty sentinel entry
+            int genreID = (int) tagData[127] & 0xff; // unsigned byte
+            genre = GENRES[Math.min(genreID, GENRES.length - 1)];
+
+            // ID3v1.1 Track addition
+            // If the last two bytes of the comment field are zero and
+            // non-zero, then the last byte is the track number
+            if (tagData[125] == 0 && tagData[126] != 0) {
+                int trackNum = (int) tagData[126] & 0xff;
+                trackNumber = Integer.toString(trackNum);
+            }
+        }
+    }
+
+
+    @Override
+    public boolean getTagsPresent() {
+        return found;
+    }
+
+    @Override
+    public String getTitle() {
+        return title;
+    }
+
+    @Override
+    public String getArtist() {
+        return artist;
+    }
+
+    @Override
+    public String getAlbum() {
+        return album;
+    }
+
+    @Override
+    public String getYear() {
+        return year;
+    }
+
+    /**
+     * Returns the single ID3v1 comment, or an empty list if no
+     *  ID3v1 tag was found. The list never contains null.
+     */
+    @Override
+    public List<ID3Comment> getComments() {
+        if (comment == null) {
+            // Nothing was parsed, so don't hand back a list holding null
+            return Arrays.<ID3Comment>asList();
+        }
+        return Arrays.asList(comment);
+    }
+
+    @Override
+    public String getGenre() {
+        return genre;
+    }
+
+    @Override
+    public String getTrackNumber() {
+        return trackNumber;
+    }
+
+    /**
+     * ID3v1 doesn't have composers,
+     *  so returns null;
+     */
+    @Override
+    public String getComposer() {
+        return null;
+    }
+
+    /**
+     * ID3v1 doesn't have album-wide artists,
+     *  so returns null;
+     */
+    @Override
+    public String getAlbumArtist() {
+        return null;
+    }
+
+    /**
+     * ID3v1 doesn't have disc numbers,
+     *  so returns null;
+     */
+    @Override
+    public String getDisc() {
+        return null;
+    }
+
+    /**
+     * ID3v1 doesn't have compilations,
+     *  so returns null;
+     */
+    @Override
+    public String getCompilation() {
+        return null;
+    }
+
+    /**
+     * Returns the identified ISO-8859-1 substring from the given byte buffer.
+     * The return value is the zero-terminated substring retrieved from
+     * between the given start and end positions in the given byte buffer.
+     * Extra whitespace (and control characters) from the beginning and the
+     * end of the substring is removed. Bytes of 0x80 and above are ordinary
+     * ISO-8859-1 characters, and are kept.
+     *
+     * @param buffer byte buffer
+     * @param start start index of the substring
+     * @param end end index of the substring
+     * @return the identified substring
+     * @throws TikaException never thrown by the current implementation,
+     *  the declaration is kept so that the signature is unchanged
+     */
+    private static String getString(byte[] buffer, int start, int end)
+            throws TikaException {
+        // Find the zero byte that marks the end of the string
+        int zero = start;
+        while (zero < end && buffer[zero] != 0) {
+            zero++;
+        }
+
+        // Skip trailing whitespace. Bytes are signed, so mask them to their
+        // unsigned value first, otherwise characters of 0x80 and above are
+        // negative and would be stripped as if they were whitespace
+        end = zero;
+        while (start < end && (buffer[end - 1] & 0xff) <= ' ') {
+            end--;
+        }
+
+        // Skip leading whitespace, with the same unsigned comparison
+        while (start < end && (buffer[start] & 0xff) <= ' ') {
+            start++;
+        }
+
+        // Return the remaining substring
+        return new String(buffer, start, end - start, ISO_8859_1);
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,159 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTag;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTagIterator;
+import org.xml.sax.SAXException;
+
+/**
+ * This is used to parse ID3 Version 2.2 Tag information from an MP3 file,
+ * if available.
+ *
+ * @see <a href="http://id3lib.sourceforge.net/id3/id3v2-00.txt">MP3 ID3 Version 2.2 specification</a>
+ */
+public class ID3v22Handler implements ID3Tags {
+    private String title;
+    private String artist;
+    private String album;
+    private String year;
+    private String composer;
+    private String genre;
+    private String trackNumber;
+    private String albumArtist;
+    private String disc;
+    private List<ID3Comment> comments = new ArrayList<ID3Comment>();
+
+    public ID3v22Handler(ID3v2Frame frame)
+            throws IOException, SAXException, TikaException {
+        RawTagIterator tags = new RawV22TagIterator(frame);
+        while (tags.hasNext()) {
+            RawTag tag = tags.next();
+            if (tag.name.equals("TT2")) {
+                title = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TP1")) {
+                artist = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TP2")) {
+                albumArtist = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TAL")) {
+                album = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TYE")) {
+                year = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TCM")) {
+                composer = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("COM")) {
+                comments.add( getComment(tag.data, 0, tag.data.length) ); 
+            } else if (tag.name.equals("TRK")) {
+                trackNumber = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TPA")) {
+                disc = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TCO")) {
+                genre = extractGenre( getTagString(tag.data, 0, tag.data.length) );
+            }
+        }
+    }
+
+    private String getTagString(byte[] data, int offset, int length) {
+        return ID3v2Frame.getTagString(data, offset, length);
+    }
+    private ID3Comment getComment(byte[] data, int offset, int length) {
+        return ID3v2Frame.getComment(data, offset, length);
+    }
+    
+    protected static String extractGenre(String rawGenre) {
+       int open = rawGenre.indexOf("(");
+       int close = rawGenre.indexOf(")");
+       if (open == -1 && close == -1) {
+          return rawGenre;
+       } else if (open < close) {
+           String genreStr = rawGenre.substring(0, open).trim();
+           try {
+               int genreID = Integer.parseInt(rawGenre.substring(open+1, close));
+               return ID3Tags.GENRES[genreID];
+           } catch(ArrayIndexOutOfBoundsException invalidNum) {
+              return genreStr;
+           } catch(NumberFormatException notANum) {
+              return genreStr;
+           }
+       } else {
+          return null;
+       }
+    }
+
+    public boolean getTagsPresent() {
+        return true;
+    }
+
+    public String getTitle() {
+        return title;
+    }
+
+    public String getArtist() {
+        return artist;
+    }
+
+    public String getAlbum() {
+        return album;
+    }
+
+    public String getYear() {
+        return year;
+    }
+    
+    public String getComposer() {
+        return composer;
+    }
+
+    public List<ID3Comment> getComments() {
+        return comments;
+    }
+
+    public String getGenre() {
+        return genre;
+    }
+
+    public String getTrackNumber() {
+        return trackNumber;
+    }
+
+    public String getAlbumArtist() {
+        return albumArtist;
+    }
+
+    public String getDisc() {
+        return disc;
+    }
+
+    /**
+     * ID3v22 doesn't have compilations,
+     *  so returns null;
+     */
+    public String getCompilation() {
+        return null;
+    }
+
+    private class RawV22TagIterator extends RawTagIterator {
+        private RawV22TagIterator(ID3v2Frame frame) {
+            frame.super(3, 3, 1, 0);
+        }
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,138 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTag;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTagIterator;
+import org.xml.sax.SAXException;
+
+/**
+ * This is used to parse ID3 Version 2.3 Tag information from an MP3 file,
+ * if available.
+ *
+ * @see <a href="http://id3lib.sourceforge.net/id3/id3v2.3.0.html">MP3 ID3 Version 2.3 specification</a>
+ */
+public class ID3v23Handler implements ID3Tags {
+    private String title;
+    private String artist;
+    private String album;
+    private String year;
+    private String composer;
+    private String genre;
+    private String trackNumber;
+    private String albumArtist;
+    private String disc;
+    private String compilation;
+    private List<ID3Comment> comments = new ArrayList<ID3Comment>();
+
+    public ID3v23Handler(ID3v2Frame frame)
+            throws IOException, SAXException, TikaException {
+        RawTagIterator tags = new RawV23TagIterator(frame);
+        while (tags.hasNext()) {
+            RawTag tag = tags.next();
+            if (tag.name.equals("TIT2")) {
+                title = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TPE1")) {
+                artist = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TPE2")) {
+                albumArtist = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TALB")) {
+                album = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TYER")) {
+                year = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TCOM")) {
+                composer = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("COMM")) {
+                comments.add( getComment(tag.data, 0, tag.data.length) ); 
+            } else if (tag.name.equals("TRCK")) {
+                trackNumber = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TPOS")) {
+                disc = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TCMP")) {
+                compilation = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TCON")) {
+                genre = ID3v22Handler.extractGenre( getTagString(tag.data, 0, tag.data.length) );
+            }
+        }
+    }
+
+    private String getTagString(byte[] data, int offset, int length) {
+        return ID3v2Frame.getTagString(data, offset, length);
+    }
+    private ID3Comment getComment(byte[] data, int offset, int length) {
+       return ID3v2Frame.getComment(data, offset, length);
+    }
+
+    public boolean getTagsPresent() {
+        return true;
+    }
+
+    public String getTitle() {
+        return title;
+    }
+
+    public String getArtist() {
+        return artist;
+    }
+
+    public String getAlbum() {
+        return album;
+    }
+
+    public String getYear() {
+        return year;
+    }
+
+    public String getComposer() {
+        return composer;
+    }
+
+    public List<ID3Comment> getComments() {
+        return comments;
+    }
+
+    public String getGenre() {
+        return genre;
+    }
+
+    public String getTrackNumber() {
+        return trackNumber;
+    }
+
+    public String getAlbumArtist() {
+        return albumArtist;
+    }
+
+    public String getDisc() {
+        return disc;
+    }
+
+    public String getCompilation() {
+        return compilation;
+    }
+
+    private class RawV23TagIterator extends RawTagIterator {
+        private RawV23TagIterator(ID3v2Frame frame) {
+            frame.super(4, 4, 1, 2);
+        }
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTag;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTagIterator;
+import org.xml.sax.SAXException;
+
+/**
+ * This is used to parse ID3 Version 2.4 Tag information from an MP3 file,
+ * if available.
+ *
+ * @see <a href="http://www.id3.org/id3v2.4.0-structure">MP3 ID3 Version 2.4 specification</a>
+ * @see <a href="http://www.id3.org/id3v2.4.0-frames">MP3 ID3 Version 2.4 frames/tags</a>
+ */
+public class ID3v24Handler implements ID3Tags {
+    private String title;
+    private String artist;
+    private String album;
+    private String year;
+    private String composer;
+    private String genre;
+    private String trackNumber;
+    private String albumArtist;
+    private String disc;
+    private String compilation;
+    private List<ID3Comment> comments = new ArrayList<ID3Comment>();
+
+    public ID3v24Handler(ID3v2Frame frame)
+            throws IOException, SAXException, TikaException {
+        RawTagIterator tags = new RawV24TagIterator(frame);
+        while (tags.hasNext()) {
+            RawTag tag = tags.next();
+            if (tag.name.equals("TIT2")) {
+                title = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TPE1")) {
+                artist = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TPE2")) {
+                albumArtist = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TALB")) {
+                album = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TYER")) {
+                year = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TDRC")) {
+               if(year == null) {
+                  year = getTagString(tag.data, 0, tag.data.length);
+               }
+            } else if (tag.name.equals("TCOM")) {
+                composer = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("COMM")) {
+                comments.add( getComment(tag.data, 0, tag.data.length) ); 
+            } else if (tag.name.equals("TRCK")) {
+                trackNumber = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TPOS")) {
+                disc = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TCMP")) {
+                compilation = getTagString(tag.data, 0, tag.data.length); 
+            } else if (tag.name.equals("TCON")) {
+               genre = ID3v22Handler.extractGenre( getTagString(tag.data, 0, tag.data.length) );
+            }
+        }
+    }
+
+    private String getTagString(byte[] data, int offset, int length) {
+        return ID3v2Frame.getTagString(data, offset, length);
+    }
+    private ID3Comment getComment(byte[] data, int offset, int length) {
+        return ID3v2Frame.getComment(data, offset, length);
+    }
+
+    public boolean getTagsPresent() {
+        return true;
+    }
+
+    public String getTitle() {
+        return title;
+    }
+
+    public String getArtist() {
+        return artist;
+    }
+
+    public String getAlbum() {
+        return album;
+    }
+
+    public String getYear() {
+        return year;
+    }
+
+    public String getComposer() {
+        return composer;
+    }
+
+    public List<ID3Comment> getComments() {
+        return comments;
+    }
+
+    public String getGenre() {
+        return genre;
+    }
+
+    public String getTrackNumber() {
+        return trackNumber;
+    }
+
+    public String getAlbumArtist() {
+        return albumArtist;
+    }
+
+    public String getDisc() {
+        return disc;
+    }
+
+    public String getCompilation() {
+        return compilation;
+    }
+
+    private class RawV24TagIterator extends RawTagIterator {
+        private RawV24TagIterator(ID3v2Frame frame) {
+            frame.super(4, 4, 1, 2);
+        }
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,424 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PushbackInputStream;
+import java.io.UnsupportedEncodingException;
+import java.util.Iterator;
+
+import org.apache.tika.parser.mp3.ID3Tags.ID3Comment;
+
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+
+/**
+ * A frame of ID3v2 data, which is then passed to a handler to
+ * be turned into useful data.
+ * <p>
+ * The whole frame (flags, length, optional extended header and
+ * the tag data) is read from the stream when the instance is
+ * created by {@link #createFrameIfPresent(InputStream)}.
+ */
+public class ID3v2Frame implements MP3Frame {
+    private final int majorVersion;
+    private final int minorVersion;
+    private final int flags;
+    private final int length;
+    /** Excludes the header size part */
+    private final byte[] extendedHeader;
+    private final byte[] data;
+
+    /** @return the major version byte that followed the "ID3" marker */
+    public int getMajorVersion() {
+        return majorVersion;
+    }
+
+    /** @return the minor version byte that followed the major version */
+    public int getMinorVersion() {
+        return minorVersion;
+    }
+
+    /** @return the header flags byte */
+    public int getFlags() {
+        return flags;
+    }
+
+    /** @return the length of the tag data, as declared in the header */
+    public int getLength() {
+        return length;
+    }
+
+    /** @return the extended header bytes, or null if none was read */
+    public byte[] getExtendedHeader() {
+        return extendedHeader;
+    }
+
+    /**
+     * @return the tag data. The array always has the declared length;
+     *  if the stream ended early, the unread tail is left as zeros.
+     */
+    public byte[] getData() {
+        return data;
+    }
+
+    /**
+     * Returns the next ID3v2 Frame in the file, or null if the
+     *  next batch of data doesn't correspond to an ID3v2 header.
+     * If no ID3v2 frame could be detected and the passed in input stream is a
+     * {@code PushbackInputStream}, the bytes read so far are pushed back so
+     * that they can be read again.
+     * ID3v2 Frames should come before all Audio ones.
+     *
+     * @param inp the stream to read from
+     * @return the frame, or null if no ID3v2 header was found
+     * @throws IOException if reading fails, or the stream ends part way
+     *  through the frame header
+     */
+    public static MP3Frame createFrameIfPresent(InputStream inp)
+            throws IOException {
+        int h1 = inp.read();
+        int h2 = inp.read();
+        int h3 = inp.read();
+
+        // Is it an ID3v2 Frame?
+        if (h1 == (int)'I' && h2 == (int)'D' && h3 == (int)'3') {
+            int majorVersion = inp.read();
+            int minorVersion = inp.read();
+            if (majorVersion == -1 || minorVersion == -1) {
+                pushBack(inp, h1, h2, h3, majorVersion, minorVersion);
+                return null;
+            }
+            return new ID3v2Frame(majorVersion, minorVersion, inp);
+        }
+
+        // Not a frame header
+        pushBack(inp, h1, h2, h3);
+        return null;
+    }
+
+    /**
+     * Pushes bytes back into the stream if possible. This method is called if
+     * no ID3v2 header could be found at the current stream position.
+     * <p>
+     * Only values that were really read are pushed back: a value of -1
+     * marks the end of the stream, so it and everything after it are
+     * dropped rather than being turned into spurious 0xFF bytes.
+     *
+     * @param inp the input stream
+     * @param bytes the values returned by {@code read()}, in read order
+     * @throws IOException if an error occurs
+     */
+    private static void pushBack(InputStream inp, int... bytes)
+            throws IOException {
+        if (!(inp instanceof PushbackInputStream)) {
+            return;
+        }
+
+        // Count the real bytes that precede the first end-of-stream marker
+        int count = 0;
+        while (count < bytes.length && bytes[count] >= 0) {
+            count++;
+        }
+        if (count == 0) {
+            return;
+        }
+
+        byte[] buf = new byte[count];
+        for (int i = 0; i < count; i++) {
+            buf[i] = (byte) bytes[i];
+        }
+        ((PushbackInputStream) inp).unread(buf);
+    }
+
+    /**
+     * Reads the remainder of the frame (everything after the version
+     *  bytes) from the stream.
+     *
+     * @throws IOException if the header or the extended header is cut short
+     */
+    private ID3v2Frame(int majorVersion, int minorVersion, InputStream inp)
+            throws IOException {
+        this.majorVersion = majorVersion;
+        this.minorVersion = minorVersion;
+
+        // Get the flags and the length
+        flags = inp.read();
+        length = get7BitsInt(readFully(inp, 4), 0);
+
+        // Do we have an extended header?
+        // NOTE(review): the ID3v2.3 / v2.4 specifications place the
+        //  "extended header" flag in bit 6 (0x40) of the flags byte,
+        //  while this tests 0x02 - confirm which is intended
+        if ((flags & 0x02) == 0x02) {
+            int size = getInt(readFully(inp, 4));
+            extendedHeader = readFully(inp, size);
+        } else {
+            extendedHeader = null;
+        }
+
+        // Get the frame's data, or at least as much
+        //  of it as we could do
+        data = readFully(inp, length, false);
+    }
+
+    /** Reads a big endian 4 byte integer from the start of the data */
+    protected static int getInt(byte[] data) {
+        return getInt(data, 0);
+    }
+
+    /** Reads a big endian 4 byte integer at the given offset */
+    protected static int getInt(byte[] data, int offset) {
+        int b0 = data[offset+0] & 0xFF;
+        int b1 = data[offset+1] & 0xFF;
+        int b2 = data[offset+2] & 0xFF;
+        int b3 = data[offset+3] & 0xFF;
+        return (b0 << 24) + (b1 << 16) + (b2 << 8) + (b3 << 0);
+    }
+
+    /** Reads a big endian 3 byte integer at the given offset */
+    protected static int getInt3(byte[] data, int offset) {
+        int b0 = data[offset+0] & 0xFF;
+        int b1 = data[offset+1] & 0xFF;
+        int b2 = data[offset+2] & 0xFF;
+        return (b0 << 16) + (b1 << 8) + (b2 << 0);
+    }
+
+    /** Reads a big endian 2 byte integer at the given offset */
+    protected static int getInt2(byte[] data, int offset) {
+        int b0 = data[offset+0] & 0xFF;
+        int b1 = data[offset+1] & 0xFF;
+        return (b0 << 8) + (b1 << 0);
+    }
+
+    /**
+     * AKA a Synchsafe integer.
+     * 4 bytes hold a 28 bit number. The highest
+     *  bit in each byte is always 0 and always ignored.
+     */
+    protected static int get7BitsInt(byte[] data, int offset) {
+        int b0 = data[offset+0] & 0x7F;
+        int b1 = data[offset+1] & 0x7F;
+        int b2 = data[offset+2] & 0x7F;
+        int b3 = data[offset+3] & 0x7F;
+        return (b0 << 21) + (b1 << 14) + (b2 << 7) + (b3 << 0);
+    }
+
+    /**
+     * Reads exactly the given number of bytes, treating a short
+     *  read as an error.
+     *
+     * @throws IOException if fewer than length bytes are available,
+     *  or length is negative
+     */
+    protected static byte[] readFully(InputStream inp, int length)
+            throws IOException {
+        return readFully(inp, length, true);
+    }
+
+    /**
+     * Reads up to the given number of bytes from the stream.
+     *
+     * @param inp the stream to read from
+     * @param length the number of bytes wanted
+     * @param shortDataIsFatal if true, hitting the end of the stream
+     *  early raises an IOException; if false the array is returned
+     *  anyway, with the part that could not be read left as zeros
+     * @return an array of exactly length bytes
+     * @throws IOException if reading fails, if length is negative (which
+     *  happens when a size field in a corrupt file has its top bit
+     *  set), or on a short read when that is fatal
+     */
+    protected static byte[] readFully(InputStream inp, int length, boolean shortDataIsFatal)
+            throws IOException {
+        if (length < 0) {
+            throw new IOException("Tried to read a negative number of bytes: " + length);
+        }
+        byte[] b = new byte[length];
+
+        int pos = 0;
+        while (pos < length) {
+            int read = inp.read(b, pos, length-pos);
+            if (read == -1) {
+                if (shortDataIsFatal) {
+                    throw new IOException("Tried to read " + length + " bytes, but only " + pos + " bytes present");
+                }
+                // Give them what we found
+                // TODO Log the short read
+                break;
+            }
+            pos += read;
+        }
+
+        return b;
+    }
+
+    /**
+     * A text encoding that an ID3v2 string can be flagged as using,
+     *  along with whether its null terminator is one or two bytes.
+     */
+    protected static class TextEncoding {
+        public final boolean doubleByte;
+        public final String encoding;
+        private TextEncoding(String encoding, boolean doubleByte) {
+            this.doubleByte = doubleByte;
+            this.encoding = encoding;
+        }
+    }
+    /** The known encodings, indexed by the value of the encoding flag byte */
+    protected static final TextEncoding[] encodings = new TextEncoding[] {
+            new TextEncoding("ISO-8859-1", false),
+            new TextEncoding("UTF-16", true), // With BOM
+            new TextEncoding("UTF-16BE", true), // Without BOM
+            new TextEncoding("UTF-8", false)
+    };
+
+    /**
+     * Returns the (possibly null padded) String at the given offset and
+     * length. String encoding is held in the first byte; if the first
+     * byte isn't a known encoding flag, ISO-8859-1 is assumed and the
+     * byte is treated as part of the text.
+     */
+    protected static String getTagString(byte[] data, int offset, int length) {
+        int actualLength = length;
+        if (actualLength == 0) {
+            return "";
+        }
+        if (actualLength == 1 && data[offset] == 0) {
+            return "";
+        }
+
+        // Does it have an encoding flag?
+        // Detect by the first byte being a valid index into the encodings
+        TextEncoding encoding = encodings[0];
+        byte maybeEncodingFlag = data[offset];
+        if (maybeEncodingFlag >= 0 && maybeEncodingFlag < encodings.length) {
+            offset++;
+            actualLength--;
+            encoding = encodings[maybeEncodingFlag];
+        }
+
+        // Trim off null termination / padding (as present)
+        while (encoding.doubleByte && actualLength >= 2 && data[offset+actualLength-1] == 0 && data[offset+actualLength-2] == 0) {
+            actualLength -= 2;
+        }
+        while (!encoding.doubleByte && actualLength >= 1 && data[offset+actualLength-1] == 0) {
+            actualLength--;
+        }
+        if (actualLength == 0) {
+            return "";
+        }
+
+        // TIKA-1024: If it's UTF-16 (with BOM) and all we
+        // have is a naked BOM then short-circuit here
+        // (return empty string), because new String(..)
+        // gives different results on different JVMs
+        if (encoding.encoding.equals("UTF-16") && actualLength == 2 &&
+            ((data[offset] == (byte) 0xff && data[offset+1] == (byte) 0xfe) ||
+             (data[offset] == (byte) 0xfe && data[offset+1] == (byte) 0xff))) {
+            return "";
+        }
+
+        try {
+            // Build the base string
+            return new String(data, offset, actualLength, encoding.encoding);
+        } catch (UnsupportedEncodingException e) {
+            throw new RuntimeException(
+                    "Core encoding " + encoding.encoding + " is not available", e);
+        }
+    }
+
+    /**
+     * Builds up the ID3 comment, by parsing and extracting
+     *  the comment string parts from the given data.
+     * The layout is a 1 byte encoding flag, a 3 byte language,
+     *  then [Description]\0(\0)[Text].
+     *
+     * @return the comment, or null if the data is too short to hold
+     *  the encoding flag and language, doesn't fit inside the array,
+     *  or has an unknown encoding flag
+     */
+    protected static ID3Comment getComment(byte[] data, int offset, int length) {
+        // We need the encoding flag and language as a bare minimum,
+        //  and must never look beyond the slice we were given
+        int end = offset + length;
+        if (length < 4 || offset < 0 || end > data.length) {
+            return null;
+        }
+
+        // Comments must have an encoding
+        int encodingFlag = data[offset];
+        if (encodingFlag < 0 || encodingFlag >= encodings.length) {
+            // Invalid string
+            return null;
+        }
+        TextEncoding encoding = encodings[encodingFlag];
+
+        // First is a 3 byte language
+        String lang = getString(data, offset+1, 3);
+
+        // After that we have [Desc]\0(\0)[Text]
+        int descStart = offset+4;
+        int textStart = -1;
+        String description = null;
+        String text = null;
+
+        try {
+            // Find where the description ends
+            for (int i=descStart; i<end; i++) {
+                if (encoding.doubleByte) {
+                    // The i+1 check stops a lone trailing zero byte from
+                    //  sending us off the end of the data
+                    if (data[i] == 0 && i+1 < end && data[i+1] == 0) {
+                        // Handle LE vs BE on low byte text
+                        if (i+2 < end && data[i+2] == 0) {
+                            i++;
+                        }
+                        textStart = i+2;
+                        description = new String(data, descStart, i-descStart, encoding.encoding);
+                        break;
+                    }
+                } else if (data[i] == 0) {
+                    textStart = i+1;
+                    description = new String(data, descStart, i-descStart, encoding.encoding);
+                    break;
+                }
+            }
+
+            // Did we find the end?
+            if (textStart > -1) {
+                text = new String(data, textStart, end-textStart, encoding.encoding);
+            } else {
+                // Assume everything is the text
+                text = new String(data, descStart, end-descStart, encoding.encoding);
+            }
+
+            // Return
+            return new ID3Comment(lang, description, text);
+        } catch (UnsupportedEncodingException e) {
+            throw new RuntimeException(
+                    "Core encoding " + encoding.encoding + " is not available", e);
+        }
+    }
+
+    /**
+     * Returns the String at the given
+     *  offset and length. Strings are ISO-8859-1
+     */
+    protected static String getString(byte[] data, int offset, int length) {
+        return new String(data, offset, length, ISO_8859_1);
+    }
+
+
+    /**
+     * Iterates over id3v2 raw tags.
+     * Create an instance of this that configures the
+     *  various length and multipliers.
+     * Iteration stops at the first zero byte where a tag name
+     *  should start (padding), or when too few bytes are left
+     *  to hold another tag header.
+     */
+    protected class RawTagIterator implements Iterator<RawTag> {
+        private final int nameLength;
+        private final int sizeLength;
+        private final int sizeMultiplier;
+        private final int flagLength;
+
+        private int offset = 0;
+
+        protected RawTagIterator(
+                int nameLength, int sizeLength, int sizeMultiplier,
+                int flagLength) {
+            this.nameLength = nameLength;
+            this.sizeLength = sizeLength;
+            this.sizeMultiplier = sizeMultiplier;
+            this.flagLength = flagLength;
+        }
+
+        @Override
+        public boolean hasNext() {
+            // Check for padding at the end
+            if (offset >= data.length || data[offset] == 0) {
+                return false;
+            }
+            // A truncated tag header can't be parsed, so treat it as the end
+            return offset + nameLength + sizeLength + flagLength <= data.length;
+        }
+
+        @Override
+        public RawTag next() {
+            if (!hasNext()) {
+                throw new java.util.NoSuchElementException("No more ID3v2 tags in the frame");
+            }
+            RawTag tag = new RawTag(nameLength, sizeLength, sizeMultiplier,
+                    flagLength, data, offset);
+            offset += tag.getSize();
+            return tag;
+        }
+
+        /** Not supported, the frame data is never modified */
+        @Override
+        public void remove() {
+            throw new UnsupportedOperationException("Tags cannot be removed");
+        }
+
+    }
+
+    /**
+     * A single tag from within the frame: its name, its flags
+     *  (if the version has any) and a copy of its data bytes.
+     */
+    protected static class RawTag {
+        private final int headerSize;
+        protected String name;
+        protected int flag;
+        protected byte[] data;
+
+        private RawTag(
+                int nameLength, int sizeLength, int sizeMultiplier,
+                int flagLength, byte[] frameData, int offset) {
+            headerSize = nameLength + sizeLength + flagLength;
+
+            // Name, normally 3 or 4 bytes
+            name = getString(frameData, offset, nameLength);
+
+            // Size
+            int rawSize;
+            if (sizeLength == 3) {
+                rawSize = getInt3(frameData, offset+nameLength);
+            } else {
+                rawSize = getInt(frameData, offset+nameLength);
+            }
+            int size = rawSize * sizeMultiplier;
+
+            // Flag
+            if (flagLength > 0) {
+                if (flagLength == 1) {
+                    // Mask so that a byte with the top bit set isn't sign extended
+                    flag = frameData[offset+nameLength+sizeLength] & 0xFF;
+                } else {
+                    flag = getInt2(frameData, offset+nameLength+sizeLength);
+                }
+            }
+
+            // Now data
+            int copyFrom = offset+nameLength+sizeLength+flagLength;
+            size = Math.max(0, Math.min(size, frameData.length-copyFrom)); // TIKA-1218, prevent negative size for malformed files.
+            data = new byte[size];
+            System.arraycopy(frameData, copyFrom, data, 0, size);
+        }
+
+        /** @return the size of the tag's header plus its (clamped) data */
+        protected int getSize() {
+            return headerSize + data.length;
+        }
+
+    }
+
+}

Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import static java.nio.charset.StandardCharsets.US_ASCII;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * This is used to parse Lyrics3 tag information
+ *  from an MP3 file, if available.
+ * Handles lyrics tags of up to 10kb in size.
+ * Will process any ID3v1 tag data if present.
+ * Ignores extended ID3v1 data in the lyrics block.
+ * A lyrics block with invalid sizes is ignored, rather
+ *  than causing the parsing to fail.
+ *
+ * @see <a href="http://www.id3.org/Lyrics3v2">Lyrics3 v2.0 specification</a>
+ */
+public class LyricsHandler {
+    /** Largest lyrics block that we look for */
+    private static final int MAX_LYRICS_SIZE = 10240;
+    /** Size of the ID3v1 block at the very end of the file */
+    private static final int ID3V1_SIZE = 128;
+    /** Marker that closes a Lyrics3 v2 block */
+    private static final String LYRICS_END = "LYRICS200";
+    /** Number of digits in the block size that precedes the end marker */
+    private static final int SIZE_DIGITS = 6;
+    /** Length of the LYRICSBEGIN marker, which the block size includes */
+    private static final int BEGIN_MARKER_LENGTH = 11;
+    /** Each field starts with a 3 letter code and a 5 digit length */
+    private static final int FIELD_HEADER_LENGTH = 8;
+
+    boolean foundLyrics = false;
+    String lyricsText = null;
+    ID3v1Handler id3v1 = null;
+
+    /**
+     * Reads the stream through to its end, and processes the
+     *  last 10kb + 128 bytes of it.
+     *
+     * @param stream the stream to read the tail of
+     * @param handler not used by this class
+     */
+    public LyricsHandler(InputStream stream, ContentHandler handler)
+            throws IOException, SAXException, TikaException {
+        this(getSuffix(stream, MAX_LYRICS_SIZE+ID3V1_SIZE));
+    }
+
+    /**
+     * Looks for the Lyrics data, which will be
+     *  just before the ID3v1 data (if present),
+     *  and process it.
+     * Also sets things up for the ID3v1
+     *  processing if required.
+     *
+     * @param tagData the bytes from the end of the file. Nothing is
+     *  done if there are fewer than 128 of them; lyrics are only
+     *  looked for if there are at least 137.
+     */
+    protected LyricsHandler(byte[] tagData)
+            throws IOException, SAXException, TikaException {
+        if (tagData.length < ID3V1_SIZE) {
+            return;
+        }
+
+        // Is there ID3v1 data?
+        byte[] last128 = new byte[ID3V1_SIZE];
+        System.arraycopy(tagData, tagData.length-ID3V1_SIZE, last128, 0, ID3V1_SIZE);
+        id3v1 = new ID3v1Handler(last128);
+
+        if (tagData.length < ID3V1_SIZE + LYRICS_END.length()) {
+            return;
+        }
+
+        // Are there lyrics? Look for the closing Lyrics tag
+        //  at the end to decide if there is any
+        int lookat = tagData.length - LYRICS_END.length();
+        if (id3v1.found) {
+            lookat -= ID3V1_SIZE;
+        }
+        for (int i = 0; i < LYRICS_END.length(); i++) {
+            if (tagData[lookat+i] != LYRICS_END.charAt(i)) {
+                return;
+            }
+        }
+        foundLyrics = true;
+
+        // The length (6 bytes) comes just before LYRICS200, and is the
+        //  size including the LYRICSBEGIN but excluding the
+        //  length+LYRICS200 at the end.
+        if (lookat < SIZE_DIGITS) {
+            // No room for the length, so not a valid block
+            return;
+        }
+        int length = parseDigits(
+                new String(tagData, lookat-SIZE_DIGITS, SIZE_DIGITS, UTF_8)
+        );
+        // Skip over the LYRICSBEGIN at the start, and stop before the length
+        int lyricsStart = lookat - length + 5;
+        if (length < BEGIN_MARKER_LENGTH || lyricsStart < 0) {
+            // Invalid length, or the block starts before our data does
+            return;
+        }
+
+        String lyrics = new String(
+                tagData, lyricsStart, length-BEGIN_MARKER_LENGTH,
+                US_ASCII
+        );
+
+        // Tags are a 3 letter code, 5 digit length, then data
+        int pos = 0;
+        while (pos < lyrics.length()-FIELD_HEADER_LENGTH) {
+            String tagName = lyrics.substring(pos, pos+3);
+            int tagLen = parseDigits(
+                    lyrics.substring(pos+3, pos+FIELD_HEADER_LENGTH)
+            );
+            int startPos = pos + FIELD_HEADER_LENGTH;
+            int endPos = startPos + tagLen;
+            if (tagLen < 0 || endPos > lyrics.length()) {
+                // Corrupt field, keep what we have got so far. As the
+                //  length is never negative here, pos always moves
+                //  forward, so the loop is sure to finish
+                break;
+            }
+
+            if (tagName.equals("LYR")) {
+                lyricsText = lyrics.substring(startPos, endPos);
+            }
+
+            pos = endPos;
+        }
+    }
+
+    /**
+     * Parses a string made up only of the digits 0-9.
+     * Only used for 5 and 6 digit fields, so can't overflow.
+     *
+     * @return the value, or -1 if the string is empty or
+     *  holds anything other than digits (including a sign)
+     */
+    private static int parseDigits(String digits) {
+        if (digits.length() == 0) {
+            return -1;
+        }
+        int value = 0;
+        for (int i = 0; i < digits.length(); i++) {
+            char c = digits.charAt(i);
+            if (c < '0' || c > '9') {
+                return -1;
+            }
+            value = value * 10 + (c - '0');
+        }
+        return value;
+    }
+
+    /**
+     * @return true if there was enough data to check for
+     *  an ID3v1 block, and one was found
+     */
+    public boolean hasID3v1() {
+        return id3v1 != null && id3v1.found;
+    }
+    /**
+     * @return true if a non-empty LYR field was found
+     */
+    public boolean hasLyrics() {
+        return lyricsText != null && lyricsText.length() > 0;
+    }
+
+    /**
+     * Reads and returns the last <code>length</code> bytes from the
+     * given stream. The stream is read through to its end.
+     * @param stream the <code>InputStream</code> to read from.
+     * @param length number of bytes from the end to read and return
+     * @return the last <code>length</code> bytes of the stream, or all
+     *  of its bytes if it is shorter than that. Empty if
+     *  <code>length</code> is zero or negative.
+     * @throws IOException if the stream could not be read from.
+     */
+    protected static byte[] getSuffix(InputStream stream, int length)
+            throws IOException {
+        if (length <= 0) {
+            // Nothing wanted. A zero sized buffer would also never
+            //  see the end of the stream, as read would return 0
+            return new byte[0];
+        }
+
+        byte[] buffer = new byte[2 * length];
+        int bytesInBuffer = 0;
+
+        int n = stream.read(buffer);
+        while (n != -1) {
+            bytesInBuffer += n;
+            if (bytesInBuffer == buffer.length) {
+                // Full, so keep only the newest half and carry on
+                System.arraycopy(buffer, bytesInBuffer - length, buffer, 0, length);
+                bytesInBuffer = length;
+            }
+            n = stream.read(buffer, bytesInBuffer, buffer.length - bytesInBuffer);
+        }
+
+        if (bytesInBuffer < length) {
+            length = bytesInBuffer;
+        }
+
+        byte[] result = new byte[length];
+        System.arraycopy(buffer, bytesInBuffer - length, result, 0, length);
+        return result;
+    }
+}

Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+
+/**
+ * Marker type for one frame found in an MP3 file, for
+ *  example a block of ID3v2 Tags or a piece of audio.
+ * Declares no methods of its own.
+ */
+public interface MP3Frame {
+}