You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by bo...@apache.org on 2015/12/29 00:22:47 UTC
svn commit: r1722029 [2/4] - in /tika/branches/2.x: tika-parser-modules/
tika-parser-modules/tika-multimedia-module/
tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/module/
tika-parser-modules/tika-multimedia-module/src/main/ja...
Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/TiffParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/TiffParser.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/TiffParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/TiffParser.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.image.xmp.JempboxExtractor;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser registered for the {@code tiff} image media type.
+ * <p>
+ * All extracted information ends up in the supplied {@link Metadata}; the
+ * XHTML stream written to the handler is an empty document.
+ */
+public class TiffParser extends AbstractParser {
+
+    /** Serial version UID. */
+    private static final long serialVersionUID = -3941143576535464926L;
+
+    /** The single media type this parser declares support for. */
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MediaType.image("tiff"));
+
+    /**
+     * Returns the media types handled by this parser.
+     *
+     * @param context the parse context (not consulted)
+     * @return the singleton set holding the {@code tiff} image type
+     */
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Extracts metadata from a TIFF stream and emits an empty XHTML document.
+     * <p>
+     * The stream is wrapped in a {@link TikaInputStream}; its file form is
+     * handed to {@code ImageMetadataExtractor.parseTiff} and the stream itself
+     * to {@link JempboxExtractor#parse}. The {@link TemporaryResources} used
+     * for the wrapping are always disposed, even if extraction fails.
+     *
+     * @param stream   the document to parse
+     * @param handler  receives the (empty) XHTML document
+     * @param metadata receives the extracted values
+     * @param context  the parse context (not consulted)
+     * @throws IOException   if the stream cannot be read
+     * @throws SAXException  if the handler rejects the XHTML events
+     * @throws TikaException if one of the extractors fails
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        TemporaryResources resources = new TemporaryResources();
+        try {
+            TikaInputStream tikaStream = TikaInputStream.get(stream, resources);
+
+            ImageMetadataExtractor imageExtractor = new ImageMetadataExtractor(metadata);
+            imageExtractor.parseTiff(tikaStream.getFile());
+
+            JempboxExtractor xmpExtractor = new JempboxExtractor(metadata);
+            xmpExtractor.parse(tikaStream);
+        } finally {
+            resources.dispose();
+        }
+
+        // No textual content is produced: just open and close the document.
+        XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
+        document.startDocument();
+        document.endDocument();
+    }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/WebPParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/WebPParser.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/WebPParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/WebPParser.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+
+/**
+ * Parser registered for the {@code webp} image media type.
+ * <p>
+ * All extracted information ends up in the supplied {@link Metadata}; the
+ * XHTML stream written to the handler is an empty document.
+ */
+public class WebPParser extends AbstractParser {
+
+    /** Serial version UID. */
+    private static final long serialVersionUID = -3941143576535464926L;
+
+    /** The single media type this parser declares support for. */
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MediaType.image("webp"));
+
+    /**
+     * Returns the media types handled by this parser.
+     *
+     * @param context the parse context (not consulted)
+     * @return the singleton set holding the {@code webp} image type
+     */
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Extracts metadata from a WebP stream and emits an empty XHTML document.
+     * <p>
+     * The stream is wrapped in a {@link TikaInputStream} and its file form is
+     * handed to {@code ImageMetadataExtractor.parseWebP}. The
+     * {@link TemporaryResources} used for the wrapping are always disposed,
+     * even if extraction fails.
+     *
+     * @param stream   the document to parse
+     * @param handler  receives the (empty) XHTML document
+     * @param metadata receives the extracted values
+     * @param context  the parse context (not consulted)
+     * @throws IOException   if the stream cannot be read
+     * @throws SAXException  if the handler rejects the XHTML events
+     * @throws TikaException if the extractor fails
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        TemporaryResources resources = new TemporaryResources();
+        try {
+            TikaInputStream tikaStream = TikaInputStream.get(stream, resources);
+
+            ImageMetadataExtractor imageExtractor = new ImageMetadataExtractor(metadata);
+            imageExtractor.parseWebP(tikaStream.getFile());
+        } finally {
+            resources.dispose();
+        }
+
+        // No textual content is produced: just open and close the document.
+        XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
+        document.startDocument();
+        document.endDocument();
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image.xmp;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.List;
+
+import org.apache.jempbox.xmp.XMPMetadata;
+import org.apache.jempbox.xmp.XMPSchemaDublinCore;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.xml.sax.InputSource;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+public class JempboxExtractor {
+
+ // The XMP spec says it must be unicode, but for most file formats it specifies "must be encoded in UTF-8"
+ private static final String DEFAULT_XMP_CHARSET = UTF_8.name();
+ private XMPPacketScanner scanner = new XMPPacketScanner();
+ private Metadata metadata;
+
+ public JempboxExtractor(Metadata metadata) {
+ this.metadata = metadata;
+ }
+
+ public void parse(InputStream file) throws IOException, TikaException {
+ ByteArrayOutputStream xmpraw = new ByteArrayOutputStream();
+ if (!scanner.parse(file, xmpraw)) {
+ return;
+ }
+
+ Reader decoded = new InputStreamReader(
+ new ByteArrayInputStream(xmpraw.toByteArray()),
+ DEFAULT_XMP_CHARSET);
+ try {
+ XMPMetadata xmp = XMPMetadata.load(new InputSource(decoded));
+ XMPSchemaDublinCore dc = xmp.getDublinCoreSchema();
+ if (dc != null) {
+ if (dc.getTitle() != null) {
+ metadata.set(TikaCoreProperties.TITLE, dc.getTitle());
+ }
+ if (dc.getDescription() != null) {
+ metadata.set(TikaCoreProperties.DESCRIPTION, dc.getDescription());
+ }
+ if (dc.getCreators() != null && dc.getCreators().size() > 0) {
+ metadata.set(TikaCoreProperties.CREATOR, joinCreators(dc.getCreators()));
+ }
+ if (dc.getSubjects() != null && dc.getSubjects().size() > 0) {
+ for (String keyword : dc.getSubjects()) {
+ metadata.add(TikaCoreProperties.KEYWORDS, keyword);
+ }
+ // TODO should we set KEYWORDS too?
+ // All tested photo managers set the same in Iptc.Application2.Keywords and Xmp.dc.subject
+ }
+ }
+ } catch (IOException e) {
+ // Could not parse embedded XMP metadata. That's not a serious
+ // problem, so we'll just ignore the issue for now.
+ // TODO: Make error handling like this configurable.
+ }
+ }
+
+ protected String joinCreators(List<String> creators) {
+ if (creators == null || creators.size() == 0) {
+ return "";
+ }
+ if (creators.size() == 1) {
+ return creators.get(0);
+ }
+ StringBuffer c = new StringBuffer();
+ for (String s : creators) {
+ c.append(", ").append(s);
+ }
+ return c.substring(2);
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/xmp/XMPPacketScanner.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/xmp/XMPPacketScanner.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/xmp/XMPPacketScanner.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/image/xmp/XMPPacketScanner.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* $Id: XMPPacketParser.java 750418 2009-03-05 11:03:54Z vhennebert $ */
+
+package org.apache.tika.parser.image.xmp;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+
+import static java.nio.charset.StandardCharsets.US_ASCII;
+
+/**
+ * This class is a parser for XMP packets. By default, it tries to locate the first XMP packet
+ * it finds and parses it.
+ * <p/>
+ * Important: Before you use this class to look for an XMP packet in some random file, please read
+ * the chapter on "Scanning Files for XMP Packets" in the XMP specification!
+ * <p/>
+ * Thic class was branched from http://xmlgraphics.apache.org/ XMPPacketParser.
+ * See also org.semanticdesktop.aperture.extractor.xmp.XMPExtractor, a variant.
+ */
+public class XMPPacketScanner {
+
+ private static final byte[] PACKET_HEADER;
+ private static final byte[] PACKET_HEADER_END;
+ private static final byte[] PACKET_TRAILER;
+
+ static {
+ PACKET_HEADER = "<?xpacket begin=".getBytes(US_ASCII);
+ PACKET_HEADER_END = "?>".getBytes(US_ASCII);
+ PACKET_TRAILER = "<?xpacket".getBytes(US_ASCII);
+ }
+
+ private static boolean skipAfter(InputStream in, byte[] match) throws IOException {
+ return skipAfter(in, match, null);
+ }
+
+ private static boolean skipAfter(InputStream in, byte[] match, OutputStream out)
+ throws IOException {
+ int found = 0;
+ int len = match.length;
+ int b;
+ while ((b = in.read()) >= 0) {
+ if (b == match[found]) {
+ found++;
+ if (found == len) {
+ return true;
+ }
+ } else {
+ if (out != null) {
+ if (found > 0) {
+ out.write(match, 0, found);
+ }
+ out.write(b);
+ }
+ found = 0;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Locates an XMP packet in a stream, parses it and returns the XMP metadata. If no
+ * XMP packet is found until the stream ends, null is returned. Note: This method
+ * only finds the first XMP packet in a stream. And it cannot determine whether it
+ * has found the right XMP packet if there are multiple packets.
+ * <p/>
+ * Does <em>not</em> close the stream.
+ * If XMP block was found reading can continue below the block.
+ *
+ * @param in the InputStream to search
+ * @param xmlOut to write the XMP packet to
+ * @return true if XMP packet is found, false otherwise
+ * @throws IOException if an I/O error occurs
+ * @throws TransformerException if an error occurs while parsing the XMP packet
+ */
+ public boolean parse(InputStream in, OutputStream xmlOut) throws IOException {
+ if (!in.markSupported()) {
+ in = new java.io.BufferedInputStream(in);
+ }
+ boolean foundXMP = skipAfter(in, PACKET_HEADER);
+ if (!foundXMP) {
+ return false;
+ }
+ //TODO Inspect "begin" attribute!
+ if (!skipAfter(in, PACKET_HEADER_END)) {
+ throw new IOException("Invalid XMP packet header!");
+ }
+ //TODO Do with TeeInputStream when Commons IO 1.4 is available
+ if (!skipAfter(in, PACKET_TRAILER, xmlOut)) {
+ throw new IOException("XMP packet not properly terminated!");
+ }
+ return true;
+ }
+
+}
+
Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.jpeg;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.image.ImageMetadataExtractor;
+import org.apache.tika.parser.image.xmp.JempboxExtractor;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser registered for the {@code jpeg} image media type.
+ * <p>
+ * All extracted information ends up in the supplied {@link Metadata}; the
+ * XHTML stream written to the handler is an empty document.
+ */
+public class JpegParser extends AbstractParser {
+
+    /** Serial version UID. */
+    private static final long serialVersionUID = -1355028253756234603L;
+
+    /** The single media type this parser declares support for. */
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MediaType.image("jpeg"));
+
+    /**
+     * Returns the media types handled by this parser.
+     *
+     * @param context the parse context (not consulted)
+     * @return the singleton set holding the {@code jpeg} image type
+     */
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Extracts metadata from a JPEG stream and emits an empty XHTML document.
+     * <p>
+     * The stream is wrapped in a {@link TikaInputStream}; its file form is
+     * handed to {@link ImageMetadataExtractor}{@code .parseJpeg} and the stream
+     * itself to {@link JempboxExtractor#parse}. The {@link TemporaryResources}
+     * used for the wrapping are always disposed, even if extraction fails.
+     *
+     * @param stream   the document to parse
+     * @param handler  receives the (empty) XHTML document
+     * @param metadata receives the extracted values
+     * @param context  the parse context (not consulted)
+     * @throws IOException   if the stream cannot be read
+     * @throws SAXException  if the handler rejects the XHTML events
+     * @throws TikaException if one of the extractors fails
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        TemporaryResources resources = new TemporaryResources();
+        try {
+            TikaInputStream tikaStream = TikaInputStream.get(stream, resources);
+
+            ImageMetadataExtractor imageExtractor = new ImageMetadataExtractor(metadata);
+            imageExtractor.parseJpeg(tikaStream.getFile());
+
+            JempboxExtractor xmpExtractor = new JempboxExtractor(metadata);
+            xmpExtractor.parse(tikaStream);
+        } finally {
+            resources.dispose();
+        }
+
+        // No textual content is produced: just open and close the document.
+        XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
+        document.startDocument();
+        document.endDocument();
+    }
+
+}
Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/AudioFrame.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,252 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * An Audio Frame in an MP3 file. These come after the ID3v2 tags in the file.
+ * Currently, only the header is processed, not the raw audio data.
+ */
+public class AudioFrame implements MP3Frame {
+    /** Constant for the MPEG version 1. */
+    public static final int MPEG_V1 = 3;
+
+    /** Constant for the MPEG version 2. */
+    public static final int MPEG_V2 = 2;
+
+    /** Constant for the MPEG version 2.5. */
+    public static final int MPEG_V2_5 = 0;
+
+    /** Constant for audio layer 1. */
+    public static final int LAYER_1 = 3;
+
+    /** Constant for audio layer 2. */
+    public static final int LAYER_2 = 2;
+
+    /** Constant for audio layer 3. */
+    public static final int LAYER_3 = 1;
+
+    /** Channel mode value (top two bits of the fourth header byte) meaning "single channel". */
+    private static final int CHANNEL_MODE_MONO = 3;
+
+    private final String version;
+    private final int versionCode;
+    private final int layer;
+    private final int sampleRate;
+    private final int channels;
+    private final int bitRate;
+    private final int length;
+    private final float duration;
+
+    /**
+     * Get the human readable version string, as produced from the version
+     * and layer codes.
+     */
+    public String getVersion() {
+        return version;
+    }
+
+    /**
+     * Get the sampling rate, in Hz
+     */
+    public int getSampleRate() {
+        return sampleRate;
+    }
+
+    /**
+     * Get the number of channels (1=mono, 2=stereo)
+     */
+    public int getChannels() {
+        return channels;
+    }
+
+    /**
+     * Get the version code.
+     * @return the version code (one of the {@code MPEG} constants)
+     */
+    public int getVersionCode() {
+        return versionCode;
+    }
+
+    /**
+     * Get the audio layer code.
+     * @return the audio layer (one of the {@code LAYER} constants)
+     */
+    public int getLayer() {
+        return layer;
+    }
+
+    /**
+     * Get the bit rate in bit per second.
+     * @return the bit rate
+     */
+    public int getBitRate() {
+        return bitRate;
+    }
+
+    /**
+     * Returns the frame length in bytes.
+     * @return the frame length
+     */
+    public int getLength() {
+        return length;
+    }
+
+    /**
+     * Returns the duration in milliseconds.
+     * @return the duration
+     */
+    public float getDuration() {
+        return duration;
+    }
+
+    /**
+     * Does this appear to be a 4 byte audio frame header?
+     * <p>
+     * The header starts with an 11 bit sync word: all eight bits of the first
+     * byte plus the top three bits of the second byte must be set. Any
+     * argument equal to -1 (end of stream) makes the check fail.
+     *
+     * @param h1 first header byte (0-255), or -1
+     * @param h2 second header byte (0-255), or -1
+     * @param h3 third header byte (0-255), or -1
+     * @param h4 fourth header byte (0-255), or -1
+     * @return {@code true} if the sync bits are present
+     */
+    public static boolean isAudioHeader(int h1, int h2, int h3, int h4) {
+        if (h1 == -1 || h2 == -1 || h3 == -1 || h4 == -1) {
+            return false;
+        }
+        // Check for the magic 11 bits set at the start
+        // Note - doesn't do a CRC check
+        return h1 == 0xff && (h2 & 0xe0) == 0xe0;
+    }
+
+    /**
+     * Reads a four byte header from the stream and builds the frame from it.
+     * The handler argument is not used.
+     *
+     * @deprecated Use the constructor which is passed all values directly.
+     */
+    @Deprecated
+    public AudioFrame(InputStream stream, ContentHandler handler)
+            throws IOException, SAXException, TikaException {
+        this(-2, -2, -2, -2, stream);
+    }
+
+    /**
+     * Builds the frame from four header bytes. If all four values are -2 the
+     * bytes are read from the stream first. Only version, layer, sample rate
+     * and channel count are derived; bit rate, length and duration are set
+     * to 0.
+     *
+     * @throws IOException if reading the header from the stream fails
+     * @throws IllegalArgumentException if the bytes do not start with the
+     *         frame sync bits
+     * @deprecated Use the constructor which is passed all values directly.
+     */
+    @Deprecated
+    public AudioFrame(int h1, int h2, int h3, int h4, InputStream in)
+            throws IOException {
+        if (h1 == -2 && h2 == -2 && h3 == -2 && h4 == -2) {
+            h1 = in.read();
+            h2 = in.read();
+            h3 = in.read();
+            h4 = in.read();
+        }
+
+        if (!isAudioHeader(h1, h2, h3, h4)) {
+            throw new IllegalArgumentException("Magic Audio Frame Header not found");
+        }
+
+        layer = (h2 >> 1) & 0x03;
+        versionCode = (h2 >> 3) & 0x03;
+        version = generateVersionStr(versionCode, layer);
+
+        // The table below holds the MPEG 2.5 rates; MPEG 2 doubles and
+        // MPEG 1 quadruples them. Any other rate code falls back to 8000.
+        int rateCode = (h3 >> 2) & 0x03;
+        int rate;
+        switch (rateCode) {
+            case 0:
+                rate = 11025;
+                break;
+            case 1:
+                rate = 12000;
+                break;
+            default:
+                rate = 8000;
+        }
+        if (versionCode == MPEG_V2) {
+            rate *= 2;
+        } else if (versionCode == MPEG_V1) {
+            rate *= 4;
+        }
+        sampleRate = rate;
+
+        // Channel mode lives in the top two bits of the fourth byte:
+        // 0 = stereo, 1 = joint stereo, 2 = dual channel, 3 = single channel.
+        int channelMode = (h4 >> 6) & 0x03;
+        if (channelMode == CHANNEL_MODE_MONO) {
+            channels = 1;
+        } else {
+            // Stereo, joint stereo, dual channel
+            channels = 2;
+        }
+        bitRate = 0;
+        duration = 0;
+        length = 0;
+    }
+
+    /**
+     *
+     * Creates a new instance of {@code AudioFrame} and initializes all properties.
+     * @param mpegVersion the code for the MPEG version
+     * @param layer the code for the layer
+     * @param bitRate the bit rate (in bps)
+     * @param sampleRate the sample rate (in samples per second)
+     * @param channels the number of channels
+     * @param length the frame length (in bytes)
+     * @param duration the duration of this frame (in milliseconds)
+     */
+    public AudioFrame(int mpegVersion, int layer, int bitRate, int sampleRate,
+            int channels, int length, float duration) {
+        versionCode = mpegVersion;
+        this.layer = layer;
+        this.bitRate = bitRate;
+        this.sampleRate = sampleRate;
+        this.channels = channels;
+        this.length = length;
+        this.duration = duration;
+        version = generateVersionStr(mpegVersion, layer);
+    }
+
+    /**
+     * Generates a string for the version of this audio frame.
+     * @param version the code for the MPEG version
+     * @param layer the code for the layer
+     * @return a string for the version
+     */
+    private static String generateVersionStr(int version, int layer) {
+        StringBuilder buf = new StringBuilder(64);
+        buf.append("MPEG 3 Layer ");
+        if (layer == LAYER_3) {
+            buf.append("III");
+        } else if (layer == LAYER_2) {
+            buf.append("II");
+        } else if (layer == LAYER_1) {
+            buf.append("I");
+        } else {
+            buf.append("(reserved)");
+        }
+
+        buf.append(" Version ");
+        if (version == MPEG_V2_5) {
+            buf.append("2.5");
+        } else if (version == MPEG_V2) {
+            buf.append("2");
+        } else if (version == MPEG_V1) {
+            buf.append("1");
+        } else {
+            // NOTE(review): spelling kept exactly as before because this text
+            // is returned to callers via getVersion().
+            buf.append("(reseved)");
+        }
+
+        return buf.toString();
+    }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/CompositeTagHandler.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * Takes an array of {@link ID3Tags} in preference order, and when asked for
+ * a given tag, will return it from the first {@link ID3Tags} that has it.
+ */
+public class CompositeTagHandler implements ID3Tags {
+
+ private ID3Tags[] tags;
+
+ public CompositeTagHandler(ID3Tags[] tags) {
+ this.tags = tags;
+ }
+
+ public boolean getTagsPresent() {
+ for (ID3Tags tag : tags) {
+ if (tag.getTagsPresent()) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ public String getTitle() {
+ for (ID3Tags tag : tags) {
+ if (tag.getTitle() != null) {
+ return tag.getTitle();
+ }
+ }
+ return null;
+ }
+
+ public String getArtist() {
+ for (ID3Tags tag : tags) {
+ if (tag.getArtist() != null) {
+ return tag.getArtist();
+ }
+ }
+ return null;
+ }
+
+ public String getAlbum() {
+ for (ID3Tags tag : tags) {
+ if (tag.getAlbum() != null) {
+ return tag.getAlbum();
+ }
+ }
+ return null;
+ }
+
+ public String getComposer() {
+ for (ID3Tags tag : tags) {
+ if (tag.getComposer() != null) {
+ return tag.getComposer();
+ }
+ }
+ return null;
+ }
+
+ public String getYear() {
+ for (ID3Tags tag : tags) {
+ if (tag.getYear() != null) {
+ return tag.getYear();
+ }
+ }
+ return null;
+ }
+
+ public List<ID3Comment> getComments() {
+ for (ID3Tags tag : tags) {
+ List<ID3Comment> comments = tag.getComments();
+ if (comments != null && comments.size() > 0) {
+ return comments;
+ }
+ }
+ return Collections.emptyList();
+ }
+
+ public String getGenre() {
+ for (ID3Tags tag : tags) {
+ if (tag.getGenre() != null) {
+ return tag.getGenre();
+ }
+ }
+ return null;
+ }
+
+ public String getTrackNumber() {
+ for (ID3Tags tag : tags) {
+ if (tag.getTrackNumber() != null) {
+ return tag.getTrackNumber();
+ }
+ }
+ return null;
+ }
+
+ public String getAlbumArtist() {
+ for (ID3Tags tag : tags) {
+ if (tag.getAlbumArtist() != null) {
+ return tag.getAlbumArtist();
+ }
+ }
+ return null;
+ }
+
+ public String getDisc() {
+ for (ID3Tags tag : tags) {
+ if (tag.getDisc() != null) {
+ return tag.getDisc();
+ }
+ }
+ return null;
+ }
+
+ public String getCompilation() {
+ for (ID3Tags tag : tags) {
+ if (tag.getCompilation() != null) {
+ return tag.getCompilation();
+ }
+ }
+ return null;
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3Tags.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,254 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.util.List;
+
+/**
+ * Interface that defines the common interface for ID3 tag parsers,
+ * such as ID3v1 and ID3v2.3.
+ * Implementations should return NULL if the file lacks a given
+ * tag, or if the tag isn't defined for the version.
+ *
+ * Note that so far, only the ID3v1 core tags are listed here. In
+ * future, we may wish to add more to cover the extra tags that
+ * our ID3v2 handlers can produce.
+ */
+public interface ID3Tags {
+ /**
+ * List of predefined genres.
+ *
+ * @see http://www.id3.org/id3v2-00
+ */
+ String[] GENRES = new String[] {
+ /* 0 */ "Blues",
+ /* 1 */ "Classic Rock",
+ /* 2 */ "Country",
+ /* 3 */ "Dance",
+ /* 4 */ "Disco",
+ /* 5 */ "Funk",
+ /* 6 */ "Grunge",
+ /* 7 */ "Hip-Hop",
+ /* 8 */ "Jazz",
+ /* 9 */ "Metal",
+ /* 10 */ "New Age",
+ /* 11 */ "Oldies",
+ /* 12 */ "Other",
+ /* 13 */ "Pop",
+ /* 14 */ "R&B",
+ /* 15 */ "Rap",
+ /* 16 */ "Reggae",
+ /* 17 */ "Rock",
+ /* 18 */ "Techno",
+ /* 19 */ "Industrial",
+ /* 20 */ "Alternative",
+ /* 21 */ "Ska",
+ /* 22 */ "Death Metal",
+ /* 23 */ "Pranks",
+ /* 24 */ "Soundtrack",
+ /* 25 */ "Euro-Techno",
+ /* 26 */ "Ambient",
+ /* 27 */ "Trip-Hop",
+ /* 28 */ "Vocal",
+ /* 29 */ "Jazz+Funk",
+ /* 30 */ "Fusion",
+ /* 31 */ "Trance",
+ /* 32 */ "Classical",
+ /* 33 */ "Instrumental",
+ /* 34 */ "Acid",
+ /* 35 */ "House",
+ /* 36 */ "Game",
+ /* 37 */ "Sound Clip",
+ /* 38 */ "Gospel",
+ /* 39 */ "Noise",
+ /* 40 */ "AlternRock",
+ /* 41 */ "Bass",
+ /* 42 */ "Soul",
+ /* 43 */ "Punk",
+ /* 44 */ "Space",
+ /* 45 */ "Meditative",
+ /* 46 */ "Instrumental Pop",
+ /* 47 */ "Instrumental Rock",
+ /* 48 */ "Ethnic",
+ /* 49 */ "Gothic",
+ /* 50 */ "Darkwave",
+ /* 51 */ "Techno-Industrial",
+ /* 52 */ "Electronic",
+ /* 53 */ "Pop-Folk",
+ /* 54 */ "Eurodance",
+ /* 55 */ "Dream",
+ /* 56 */ "Southern Rock",
+ /* 57 */ "Comedy",
+ /* 58 */ "Cult",
+ /* 59 */ "Gangsta",
+ /* 60 */ "Top 40",
+ /* 61 */ "Christian Rap",
+ /* 62 */ "Pop/Funk",
+ /* 63 */ "Jungle",
+ /* 64 */ "Native American",
+ /* 65 */ "Cabaret",
+ /* 66 */ "New Wave",
+ /* 67 */ "Psychadelic",
+ /* 68 */ "Rave",
+ /* 69 */ "Showtunes",
+ /* 70 */ "Trailer",
+ /* 71 */ "Lo-Fi",
+ /* 72 */ "Tribal",
+ /* 73 */ "Acid Punk",
+ /* 74 */ "Acid Jazz",
+ /* 75 */ "Polka",
+ /* 76 */ "Retro",
+ /* 77 */ "Musical",
+ /* 78 */ "Rock & Roll",
+ /* 79 */ "Hard Rock",
+ /* 80 */ "Folk",
+ /* 81 */ "Folk-Rock",
+ /* 82 */ "National Folk",
+ /* 83 */ "Swing",
+ /* 84 */ "Fast Fusion",
+ /* 85 */ "Bebob",
+ /* 86 */ "Latin",
+ /* 87 */ "Revival",
+ /* 88 */ "Celtic",
+ /* 89 */ "Bluegrass",
+ /* 90 */ "Avantgarde",
+ /* 91 */ "Gothic Rock",
+ /* 92 */ "Progressive Rock",
+ /* 93 */ "Psychedelic Rock",
+ /* 94 */ "Symphonic Rock",
+ /* 95 */ "Slow Rock",
+ /* 96 */ "Big Band",
+ /* 97 */ "Chorus",
+ /* 98 */ "Easy Listening",
+ /* 99 */ "Acoustic",
+ /* 100 */ "Humour",
+ /* 101 */ "Speech",
+ /* 102 */ "Chanson",
+ /* 103 */ "Opera",
+ /* 104 */ "Chamber Music",
+ /* 105 */ "Sonata",
+ /* 106 */ "Symphony",
+ /* 107 */ "Booty Bass",
+ /* 108 */ "Primus",
+ /* 109 */ "Porn Groove",
+ /* 110 */ "Satire",
+ /* 111 */ "Slow Jam",
+ /* 112 */ "Club",
+ /* 113 */ "Tango",
+ /* 114 */ "Samba",
+ /* 115 */ "Folklore",
+ /* 116 */ "Ballad",
+ /* 117 */ "Power Ballad",
+ /* 118 */ "Rhythmic Soul",
+ /* 119 */ "Freestyle",
+ /* 120 */ "Duet",
+ /* 121 */ "Punk Rock",
+ /* 122 */ "Drum Solo",
+ /* 123 */ "A capella",
+ /* 124 */ "Euro-House",
+ /* 125 */ "Dance Hall",
+ /* sentinel */ ""
+ };
+
+ /**
+ * Does the file contain this kind of tags?
+ */
+ boolean getTagsPresent();
+
+ String getTitle();
+
+ /**
+ * The Artist for the track
+ */
+ String getArtist();
+
+ /**
+ * The Artist for the overall album / compilation of albums
+ */
+ String getAlbumArtist();
+
+ String getAlbum();
+
+ String getComposer();
+
+ String getCompilation();
+
+ /**
+ * Retrieves the comments, if any.
+ * Files may have more than one comment, but normally only
+ * one with any language/description pair.
+ */
+ List<ID3Comment> getComments();
+
+ String getGenre();
+
+ String getYear();
+
+ /**
+ * The number of the track within the album / recording
+ */
+ String getTrackNumber();
+
+ /**
+ * The number of the disc this belongs to, within the set
+ */
+ String getDisc();
+
+ /**
+ * Represents a comments in ID3 (especially ID3 v2), where are
+ * made up of several parts
+ */
+ public static class ID3Comment {
+ private String language;
+ private String description;
+ private String text;
+
+ /**
+ * Creates an ID3 v1 style comment tag
+ */
+ public ID3Comment(String id3v1Text) {
+ this.text = id3v1Text;
+ }
+ /**
+ * Creates an ID3 v2 style comment tag
+ */
+ public ID3Comment(String language, String description, String text) {
+ this.language = language;
+ this.description = description;
+ this.text = text;
+ }
+
+ /**
+ * Gets the language, if present
+ */
+ public String getLanguage() {
+ return language;
+ }
+ /**
+ * Gets the description, if present
+ */
+ public String getDescription() {
+ return description;
+ }
+ /**
+ * Gets the text, if present
+ */
+ public String getText() {
+ return text;
+ }
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v1Handler.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,183 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.tika.exception.TikaException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+
+/**
+ * This is used to parse ID3 Version 1 Tag information from an MP3 file,
+ * if available.
+ *
+ * @see <a href="http://www.id3.org/ID3v1">MP3 ID3 Version 1 specification</a>
+ */
+public class ID3v1Handler implements ID3Tags {
+ private String title;
+ private String artist;
+ private String album;
+ private String year;
+ private ID3Comment comment;
+ private String genre;
+ private String trackNumber;
+
+ boolean found = false;
+
+ public ID3v1Handler(InputStream stream, ContentHandler handler)
+ throws IOException, SAXException, TikaException {
+ this(LyricsHandler.getSuffix(stream, 128));
+ }
+
+ /**
+ * Creates from the last 128 bytes of a stream.
+ * @param tagData Must be the last 128 bytes
+ */
+ protected ID3v1Handler(byte[] tagData)
+ throws IOException, SAXException, TikaException {
+ if (tagData.length == 128
+ && tagData[0] == 'T' && tagData[1] == 'A' && tagData[2] == 'G') {
+ found = true;
+
+ title = getString(tagData, 3, 33);
+ artist = getString(tagData, 33, 63);
+ album = getString(tagData, 63, 93);
+ year = getString(tagData, 93, 97);
+
+ String commentStr = getString(tagData, 97, 127);
+ comment = new ID3Comment(commentStr);
+
+ int genreID = (int) tagData[127] & 0xff; // unsigned byte
+ genre = GENRES[Math.min(genreID, GENRES.length - 1)];
+
+ // ID3v1.1 Track addition
+ // If the last two bytes of the comment field are zero and
+ // non-zero, then the last byte is the track number
+ if (tagData[125] == 0 && tagData[126] != 0) {
+ int trackNum = (int) tagData[126] & 0xff;
+ trackNumber = Integer.toString(trackNum);
+ }
+ }
+ }
+
+
+ public boolean getTagsPresent() {
+ return found;
+ }
+
+ public String getTitle() {
+ return title;
+ }
+
+ public String getArtist() {
+ return artist;
+ }
+
+ public String getAlbum() {
+ return album;
+ }
+
+ public String getYear() {
+ return year;
+ }
+
+ public List<ID3Comment> getComments() {
+ return Arrays.asList(comment);
+ }
+
+ public String getGenre() {
+ return genre;
+ }
+
+ public String getTrackNumber() {
+ return trackNumber;
+ }
+
+ /**
+ * ID3v1 doesn't have composers,
+ * so returns null;
+ */
+ public String getComposer() {
+ return null;
+ }
+
+ /**
+ * ID3v1 doesn't have album-wide artists,
+ * so returns null;
+ */
+ public String getAlbumArtist() {
+ return null;
+ }
+
+ /**
+ * ID3v1 doesn't have disc numbers,
+ * so returns null;
+ */
+ public String getDisc() {
+ return null;
+ }
+
+ /**
+ * ID3v1 doesn't have compilations,
+ * so returns null;
+ */
+ public String getCompilation() {
+ return null;
+ }
+
+ /**
+ * Returns the identified ISO-8859-1 substring from the given byte buffer.
+ * The return value is the zero-terminated substring retrieved from
+ * between the given start and end positions in the given byte buffer.
+ * Extra whitespace (and control characters) from the beginning and the
+ * end of the substring is removed.
+ *
+ * @param buffer byte buffer
+ * @param start start index of the substring
+ * @param end end index of the substring
+ * @return the identified substring
+ * @throws TikaException if the ISO-8859-1 encoding is not available
+ */
+ private static String getString(byte[] buffer, int start, int end)
+ throws TikaException {
+ // Find the zero byte that marks the end of the string
+ int zero = start;
+ while (zero < end && buffer[zero] != 0) {
+ zero++;
+ }
+
+ // Skip trailing whitespace
+ end = zero;
+ while (start < end && buffer[end - 1] <= ' ') {
+ end--;
+ }
+
+ // Skip leading whitespace
+ while (start < end && buffer[start] <= ' ') {
+ start++;
+ }
+
+ // Return the remaining substring
+ return new String(buffer, start, end - start, ISO_8859_1);
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v22Handler.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,159 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTag;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTagIterator;
+import org.xml.sax.SAXException;
+
+/**
+ * This is used to parse ID3 Version 2.2 Tag information from an MP3 file,
+ * if available.
+ *
+ * @see <a href="http://id3lib.sourceforge.net/id3/id3v2-00.txt">MP3 ID3 Version 2.2 specification</a>
+ */
+public class ID3v22Handler implements ID3Tags {
+ private String title;
+ private String artist;
+ private String album;
+ private String year;
+ private String composer;
+ private String genre;
+ private String trackNumber;
+ private String albumArtist;
+ private String disc;
+ private List<ID3Comment> comments = new ArrayList<ID3Comment>();
+
+ public ID3v22Handler(ID3v2Frame frame)
+ throws IOException, SAXException, TikaException {
+ RawTagIterator tags = new RawV22TagIterator(frame);
+ while (tags.hasNext()) {
+ RawTag tag = tags.next();
+ if (tag.name.equals("TT2")) {
+ title = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TP1")) {
+ artist = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TP2")) {
+ albumArtist = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TAL")) {
+ album = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TYE")) {
+ year = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TCM")) {
+ composer = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("COM")) {
+ comments.add( getComment(tag.data, 0, tag.data.length) );
+ } else if (tag.name.equals("TRK")) {
+ trackNumber = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TPA")) {
+ disc = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TCO")) {
+ genre = extractGenre( getTagString(tag.data, 0, tag.data.length) );
+ }
+ }
+ }
+
+ private String getTagString(byte[] data, int offset, int length) {
+ return ID3v2Frame.getTagString(data, offset, length);
+ }
+ private ID3Comment getComment(byte[] data, int offset, int length) {
+ return ID3v2Frame.getComment(data, offset, length);
+ }
+
+ protected static String extractGenre(String rawGenre) {
+ int open = rawGenre.indexOf("(");
+ int close = rawGenre.indexOf(")");
+ if (open == -1 && close == -1) {
+ return rawGenre;
+ } else if (open < close) {
+ String genreStr = rawGenre.substring(0, open).trim();
+ try {
+ int genreID = Integer.parseInt(rawGenre.substring(open+1, close));
+ return ID3Tags.GENRES[genreID];
+ } catch(ArrayIndexOutOfBoundsException invalidNum) {
+ return genreStr;
+ } catch(NumberFormatException notANum) {
+ return genreStr;
+ }
+ } else {
+ return null;
+ }
+ }
+
+ public boolean getTagsPresent() {
+ return true;
+ }
+
+ public String getTitle() {
+ return title;
+ }
+
+ public String getArtist() {
+ return artist;
+ }
+
+ public String getAlbum() {
+ return album;
+ }
+
+ public String getYear() {
+ return year;
+ }
+
+ public String getComposer() {
+ return composer;
+ }
+
+ public List<ID3Comment> getComments() {
+ return comments;
+ }
+
+ public String getGenre() {
+ return genre;
+ }
+
+ public String getTrackNumber() {
+ return trackNumber;
+ }
+
+ public String getAlbumArtist() {
+ return albumArtist;
+ }
+
+ public String getDisc() {
+ return disc;
+ }
+
+ /**
+ * ID3v22 doesn't have compilations,
+ * so returns null;
+ */
+ public String getCompilation() {
+ return null;
+ }
+
+ private class RawV22TagIterator extends RawTagIterator {
+ private RawV22TagIterator(ID3v2Frame frame) {
+ frame.super(3, 3, 1, 0);
+ }
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v23Handler.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,138 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTag;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTagIterator;
+import org.xml.sax.SAXException;
+
+/**
+ * This is used to parse ID3 Version 2.3 Tag information from an MP3 file,
+ * if available.
+ *
+ * @see <a href="http://id3lib.sourceforge.net/id3/id3v2.3.0.html">MP3 ID3 Version 2.3 specification</a>
+ */
+public class ID3v23Handler implements ID3Tags {
+ private String title;
+ private String artist;
+ private String album;
+ private String year;
+ private String composer;
+ private String genre;
+ private String trackNumber;
+ private String albumArtist;
+ private String disc;
+ private String compilation;
+ private List<ID3Comment> comments = new ArrayList<ID3Comment>();
+
+ public ID3v23Handler(ID3v2Frame frame)
+ throws IOException, SAXException, TikaException {
+ RawTagIterator tags = new RawV23TagIterator(frame);
+ while (tags.hasNext()) {
+ RawTag tag = tags.next();
+ if (tag.name.equals("TIT2")) {
+ title = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TPE1")) {
+ artist = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TPE2")) {
+ albumArtist = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TALB")) {
+ album = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TYER")) {
+ year = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TCOM")) {
+ composer = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("COMM")) {
+ comments.add( getComment(tag.data, 0, tag.data.length) );
+ } else if (tag.name.equals("TRCK")) {
+ trackNumber = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TPOS")) {
+ disc = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TCMP")) {
+ compilation = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TCON")) {
+ genre = ID3v22Handler.extractGenre( getTagString(tag.data, 0, tag.data.length) );
+ }
+ }
+ }
+
+ private String getTagString(byte[] data, int offset, int length) {
+ return ID3v2Frame.getTagString(data, offset, length);
+ }
+ private ID3Comment getComment(byte[] data, int offset, int length) {
+ return ID3v2Frame.getComment(data, offset, length);
+ }
+
+ public boolean getTagsPresent() {
+ return true;
+ }
+
+ public String getTitle() {
+ return title;
+ }
+
+ public String getArtist() {
+ return artist;
+ }
+
+ public String getAlbum() {
+ return album;
+ }
+
+ public String getYear() {
+ return year;
+ }
+
+ public String getComposer() {
+ return composer;
+ }
+
+ public List<ID3Comment> getComments() {
+ return comments;
+ }
+
+ public String getGenre() {
+ return genre;
+ }
+
+ public String getTrackNumber() {
+ return trackNumber;
+ }
+
+ public String getAlbumArtist() {
+ return albumArtist;
+ }
+
+ public String getDisc() {
+ return disc;
+ }
+
+ public String getCompilation() {
+ return compilation;
+ }
+
+ private class RawV23TagIterator extends RawTagIterator {
+ private RawV23TagIterator(ID3v2Frame frame) {
+ frame.super(4, 4, 1, 2);
+ }
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v24Handler.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTag;
+import org.apache.tika.parser.mp3.ID3v2Frame.RawTagIterator;
+import org.xml.sax.SAXException;
+
+/**
+ * This is used to parse ID3 Version 2.4 Tag information from an MP3 file,
+ * if available.
+ *
+ * @see <a href="http://www.id3.org/id3v2.4.0-structure">MP3 ID3 Version 2.4 specification</a>
+ * @see <a href="http://www.id3.org/id3v2.4.0-frames">MP3 ID3 Version 2.4 frames/tags</a>
+ */
+public class ID3v24Handler implements ID3Tags {
+ private String title;
+ private String artist;
+ private String album;
+ private String year;
+ private String composer;
+ private String genre;
+ private String trackNumber;
+ private String albumArtist;
+ private String disc;
+ private String compilation;
+ private List<ID3Comment> comments = new ArrayList<ID3Comment>();
+
+ public ID3v24Handler(ID3v2Frame frame)
+ throws IOException, SAXException, TikaException {
+ RawTagIterator tags = new RawV24TagIterator(frame);
+ while (tags.hasNext()) {
+ RawTag tag = tags.next();
+ if (tag.name.equals("TIT2")) {
+ title = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TPE1")) {
+ artist = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TPE2")) {
+ albumArtist = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TALB")) {
+ album = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TYER")) {
+ year = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TDRC")) {
+ if(year == null) {
+ year = getTagString(tag.data, 0, tag.data.length);
+ }
+ } else if (tag.name.equals("TCOM")) {
+ composer = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("COMM")) {
+ comments.add( getComment(tag.data, 0, tag.data.length) );
+ } else if (tag.name.equals("TRCK")) {
+ trackNumber = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TPOS")) {
+ disc = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TCMP")) {
+ compilation = getTagString(tag.data, 0, tag.data.length);
+ } else if (tag.name.equals("TCON")) {
+ genre = ID3v22Handler.extractGenre( getTagString(tag.data, 0, tag.data.length) );
+ }
+ }
+ }
+
+ private String getTagString(byte[] data, int offset, int length) {
+ return ID3v2Frame.getTagString(data, offset, length);
+ }
+ private ID3Comment getComment(byte[] data, int offset, int length) {
+ return ID3v2Frame.getComment(data, offset, length);
+ }
+
+ public boolean getTagsPresent() {
+ return true;
+ }
+
+ public String getTitle() {
+ return title;
+ }
+
+ public String getArtist() {
+ return artist;
+ }
+
+ public String getAlbum() {
+ return album;
+ }
+
+ public String getYear() {
+ return year;
+ }
+
+ public String getComposer() {
+ return composer;
+ }
+
+ public List<ID3Comment> getComments() {
+ return comments;
+ }
+
+ public String getGenre() {
+ return genre;
+ }
+
+ public String getTrackNumber() {
+ return trackNumber;
+ }
+
+ public String getAlbumArtist() {
+ return albumArtist;
+ }
+
+ public String getDisc() {
+ return disc;
+ }
+
+ public String getCompilation() {
+ return compilation;
+ }
+
+ private class RawV24TagIterator extends RawTagIterator {
+ private RawV24TagIterator(ID3v2Frame frame) {
+ frame.super(4, 4, 1, 2);
+ }
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/ID3v2Frame.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,424 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PushbackInputStream;
+import java.io.UnsupportedEncodingException;
+import java.util.Iterator;
+
+import org.apache.tika.parser.mp3.ID3Tags.ID3Comment;
+
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+
+/**
+ * A frame of ID3v2 data, which is then passed to a handler to
+ * be turned into useful data.
+ */
+ public class ID3v2Frame implements MP3Frame {
+     /** Major version byte, read directly after the "ID3" marker. */
+     private int majorVersion;
+     /** Minor version byte, read directly after the major version. */
+     private int minorVersion;
+     /** The header flags byte, exactly as returned by the stream. */
+     private int flags;
+     /** Declared length of the tag data, decoded from 4 synchsafe bytes. */
+     private int length;
+     /** Excludes the header size part */
+     private byte[] extendedHeader;
+     /**
+      * The tag data. Always <code>length</code> bytes long; if the stream
+      * ended early the unread remainder is left as zero bytes.
+      */
+     private byte[] data;
+
+     public int getMajorVersion() {
+         return majorVersion;
+     }
+
+     public int getMinorVersion() {
+         return minorVersion;
+     }
+
+     public int getFlags() {
+         return flags;
+     }
+
+     public int getLength() {
+         return length;
+     }
+
+     /**
+      * Returns the extended header bytes (without the size field), or
+      * null if no extended header was read.
+      */
+     public byte[] getExtendedHeader() {
+         return extendedHeader;
+     }
+
+     /**
+      * Returns the internal data array itself, not a copy.
+      */
+     public byte[] getData() {
+         return data;
+     }
+
+     /**
+      * Returns the next ID3v2 Frame in the file, or null if the
+      * next batch of data doesn't correspond to an ID3v2 header.
+      * If no ID3v2 frame could be detected and the passed in input stream is a
+      * {@code PushbackInputStream}, the bytes read so far are pushed back so
+      * that they can be read again.
+      * ID3v2 Frames should come before all Audio ones.
+      *
+      * @param inp the stream to read from
+      * @return the frame, or null if no ID3v2 header was found
+      * @throws IOException if reading fails, or the stream ends inside
+      *         the tag header
+      */
+     public static MP3Frame createFrameIfPresent(InputStream inp)
+             throws IOException {
+         int h1 = inp.read();
+         int h2 = inp.read();
+         int h3 = inp.read();
+
+         // Is it an ID3v2 Frame?
+         if (h1 == (int)'I' && h2 == (int)'D' && h3 == (int)'3') {
+             int majorVersion = inp.read();
+             int minorVersion = inp.read();
+             if (majorVersion == -1 || minorVersion == -1) {
+                 // Stream ended inside the header, treat as "no frame"
+                 pushBack(inp, h1, h2, h3, majorVersion, minorVersion);
+                 return null;
+             }
+             return new ID3v2Frame(majorVersion, minorVersion, inp);
+         }
+
+         // Not a frame header
+         pushBack(inp, h1, h2, h3);
+         return null;
+     }
+
+     /**
+      * Pushes bytes back into the stream if possible. This method is called if
+      * no ID3v2 header could be found at the current stream position.
+      * Values of -1 (the end of stream marker from {@code read()}) do not
+      * represent data that was consumed, and so are not pushed back.
+      *
+      * @param inp the input stream
+      * @param bytes the values returned by {@code read()}, in read order
+      * @throws IOException if an error occurs
+      */
+     private static void pushBack(InputStream inp, int... bytes)
+             throws IOException {
+         if (!(inp instanceof PushbackInputStream)) {
+             return;
+         }
+
+         // Only real bytes go back; casting -1 would otherwise
+         // inject a 0xFF byte that was never in the stream
+         int count = 0;
+         for (int value : bytes) {
+             if (value >= 0) {
+                 count++;
+             }
+         }
+         if (count == 0) {
+             return;
+         }
+
+         byte[] buf = new byte[count];
+         int pos = 0;
+         for (int value : bytes) {
+             if (value >= 0) {
+                 buf[pos++] = (byte) value;
+             }
+         }
+         ((PushbackInputStream) inp).unread(buf);
+     }
+
+     /**
+      * Reads the rest of the tag header (flags and length), the extended
+      * header if flagged, and then as much of the tag data as is available.
+      *
+      * @throws IOException if the stream ends before the header (and any
+      *         extended header) has been read in full
+      */
+     private ID3v2Frame(int majorVersion, int minorVersion, InputStream inp)
+             throws IOException {
+         this.majorVersion = majorVersion;
+         this.minorVersion = minorVersion;
+
+         // Get the flags and the length
+         flags = inp.read();
+         length = get7BitsInt(readFully(inp, 4), 0);
+
+         // Do we have an extended header?
+         // NOTE(review): the ID3v2.3 / v2.4 specifications place the
+         //  extended header flag at bit 6 (0x40), not 0x02 - confirm
+         //  whether this mask is intentional before changing it
+         if ((flags & 0x02) == 0x02) {
+             int size = getInt(readFully(inp, 4));
+             extendedHeader = readFully(inp, size);
+         }
+
+         // Get the frame's data, or at least as much
+         // of it as we could do
+         data = readFully(inp, length, false);
+     }
+
+     /** Reads a 4 byte big-endian integer from the start of the array. */
+     protected static int getInt(byte[] data) {
+         return getInt(data, 0);
+     }
+
+     /** Reads a 4 byte big-endian integer at the given offset. */
+     protected static int getInt(byte[] data, int offset) {
+         int b0 = data[offset+0] & 0xFF;
+         int b1 = data[offset+1] & 0xFF;
+         int b2 = data[offset+2] & 0xFF;
+         int b3 = data[offset+3] & 0xFF;
+         return (b0 << 24) + (b1 << 16) + (b2 << 8) + (b3 << 0);
+     }
+
+     /** Reads a 3 byte big-endian unsigned integer at the given offset. */
+     protected static int getInt3(byte[] data, int offset) {
+         int b0 = data[offset+0] & 0xFF;
+         int b1 = data[offset+1] & 0xFF;
+         int b2 = data[offset+2] & 0xFF;
+         return (b0 << 16) + (b1 << 8) + (b2 << 0);
+     }
+
+     /** Reads a 2 byte big-endian unsigned integer at the given offset. */
+     protected static int getInt2(byte[] data, int offset) {
+         int b0 = data[offset+0] & 0xFF;
+         int b1 = data[offset+1] & 0xFF;
+         return (b0 << 8) + (b1 << 0);
+     }
+
+     /**
+      * AKA a Synchsafe integer.
+      * 4 bytes hold a 28 bit number. The highest
+      * bit in each byte is always 0 and always ignored.
+      */
+     protected static int get7BitsInt(byte[] data, int offset) {
+         int b0 = data[offset+0] & 0x7F;
+         int b1 = data[offset+1] & 0x7F;
+         int b2 = data[offset+2] & 0x7F;
+         int b3 = data[offset+3] & 0x7F;
+         return (b0 << 21) + (b1 << 14) + (b2 << 7) + (b3 << 0);
+     }
+
+     /**
+      * Reads exactly <code>length</code> bytes, treating a short
+      * read as an error.
+      *
+      * @throws IOException if the stream ends early, or length is negative
+      */
+     protected static byte[] readFully(InputStream inp, int length)
+             throws IOException {
+         return readFully(inp, length, true);
+     }
+
+     /**
+      * Reads up to <code>length</code> bytes from the stream.
+      *
+      * @param inp the stream to read from
+      * @param length the number of bytes wanted
+      * @param shortDataIsFatal if true, hitting the end of the stream early
+      *        is an error; if false the array is returned as-is, with the
+      *        unread remainder left as zero bytes
+      * @return an array of exactly <code>length</code> bytes
+      * @throws IOException if length is negative, if reading fails, or if
+      *         the stream ends early and shortDataIsFatal is set
+      */
+     protected static byte[] readFully(InputStream inp, int length, boolean shortDataIsFatal)
+             throws IOException {
+         // Sizes come from the (possibly corrupt) file, so report a bad
+         // one as an IO problem rather than a NegativeArraySizeException
+         if (length < 0) {
+             throw new IOException("Tried to read a negative number of bytes: " + length);
+         }
+         byte[] b = new byte[length];
+
+         int pos = 0;
+         int read;
+         while (pos < length) {
+             read = inp.read(b, pos, length-pos);
+             if (read == -1) {
+                 if(shortDataIsFatal) {
+                     throw new IOException("Tried to read " + length + " bytes, but only " + pos + " bytes present");
+                 } else {
+                     // Give them what we found
+                     // TODO Log the short read
+                     return b;
+                 }
+             }
+             pos += read;
+         }
+
+         return b;
+     }
+
+     /**
+      * A text encoding a tag string may use: the charset name, and
+      * whether its null terminator is two bytes wide rather than one.
+      */
+     protected static class TextEncoding {
+         public final boolean doubleByte;
+         public final String encoding;
+         private TextEncoding(String encoding, boolean doubleByte) {
+             this.doubleByte = doubleByte;
+             this.encoding = encoding;
+         }
+     }
+     /** The known encodings, indexed by the encoding flag byte value. */
+     protected static final TextEncoding[] encodings = new TextEncoding[] {
+         new TextEncoding("ISO-8859-1", false),
+         new TextEncoding("UTF-16", true), // With BOM
+         new TextEncoding("UTF-16BE", true), // Without BOM
+         new TextEncoding("UTF-8", false)
+     };
+
+     /**
+      * Returns the (possibly null padded) String at the given offset and
+      * length. String encoding is held in the first byte;
+      */
+     protected static String getTagString(byte[] data, int offset, int length) {
+         int actualLength = length;
+         if (actualLength == 0) {
+             return "";
+         }
+         if (actualLength == 1 && data[offset] == 0) {
+             return "";
+         }
+
+         // Does it have an encoding flag?
+         // Detect by the first byte being a valid index into the
+         // encodings table; otherwise assume ISO-8859-1 with no flag
+         TextEncoding encoding = encodings[0];
+         byte maybeEncodingFlag = data[offset];
+         if (maybeEncodingFlag >= 0 && maybeEncodingFlag < encodings.length) {
+             offset++;
+             actualLength--;
+             encoding = encodings[maybeEncodingFlag];
+         }
+
+         // Trim off null termination / padding (as present)
+         while (encoding.doubleByte && actualLength >= 2 && data[offset+actualLength-1] == 0 && data[offset+actualLength-2] == 0) {
+             actualLength -= 2;
+         }
+         while (!encoding.doubleByte && actualLength >= 1 && data[offset+actualLength-1] == 0) {
+             actualLength--;
+         }
+         if (actualLength == 0) {
+             return "";
+         }
+
+         // TIKA-1024: If it's UTF-16 (with BOM) and all we
+         // have is a naked BOM then short-circuit here
+         // (return empty string), because new String(..)
+         // gives different results on different JVMs
+         if (encoding.encoding.equals("UTF-16") && actualLength == 2 &&
+                 ((data[offset] == (byte) 0xff && data[offset+1] == (byte) 0xfe) ||
+                  (data[offset] == (byte) 0xfe && data[offset+1] == (byte) 0xff))) {
+             return "";
+         }
+
+         try {
+             // Build the base string
+             return new String(data, offset, actualLength, encoding.encoding);
+         } catch (UnsupportedEncodingException e) {
+             throw new RuntimeException(
+                     "Core encoding " + encoding.encoding + " is not available", e);
+         }
+     }
+     /**
+      * Builds up the ID3 comment, by parsing and extracting
+      * the comment string parts from the given data.
+      * The layout is an encoding flag byte, a 3 byte language, then
+      * the description and text separated by a null terminator.
+      *
+      * @return the comment, or null if the encoding flag is not valid
+      */
+     protected static ID3Comment getComment(byte[] data, int offset, int length) {
+         // Comments must have an encoding
+         int encodingFlag = data[offset];
+         if (encodingFlag >= 0 && encodingFlag < encodings.length) {
+             // Good, valid flag
+         } else {
+             // Invalid string
+             return null;
+         }
+
+         TextEncoding encoding = encodings[encodingFlag];
+
+         // First is a 3 byte language
+         String lang = getString(data, offset+1, 3);
+
+         // After that we have [Desc]\0(\0)[Text]
+         int descStart = offset+4;
+         int end = offset+length;
+         int textStart = -1;
+         String description = null;
+         String text = null;
+
+         // Find where the description ends
+         try {
+             for (int i=descStart; i<end; i++) {
+                 // A double byte terminator needs both of its bytes inside
+                 // the comment; checking i+1 first stops a trailing single
+                 // zero byte from reading past the end of the data
+                 if (encoding.doubleByte && i+1 < end && data[i]==0 && data[i+1] == 0) {
+                     // Handle LE vs BE on low byte text
+                     if (i+2 < end && data[i+1] == 0 && data[i+2] == 0) {
+                         i++;
+                     }
+                     textStart = i+2;
+                     description = new String(data, descStart, i-descStart, encoding.encoding);
+                     break;
+                 }
+                 if (!encoding.doubleByte && data[i]==0) {
+                     textStart = i+1;
+                     description = new String(data, descStart, i-descStart, encoding.encoding);
+                     break;
+                 }
+             }
+
+             // Did we find the end?
+             if (textStart > -1) {
+                 text = new String(data, textStart, end-textStart, encoding.encoding);
+             } else {
+                 // Assume everything is the text
+                 text = new String(data, descStart, end-descStart, encoding.encoding);
+             }
+
+             // Return
+             return new ID3Comment(lang, description, text);
+         } catch (UnsupportedEncodingException e) {
+             throw new RuntimeException(
+                     "Core encoding " + encoding.encoding + " is not available", e);
+         }
+     }
+
+     /**
+      * Returns the String at the given
+      * offset and length. Strings are ISO-8859-1
+      */
+     protected static String getString(byte[] data, int offset, int length) {
+         return new String(data, offset, length, ISO_8859_1);
+     }
+
+
+     /**
+      * Iterates over id3v2 raw tags.
+      * Create an instance of this that configures the
+      * various length and multipliers.
+      */
+     protected class RawTagIterator implements Iterator<RawTag> {
+         private int nameLength;
+         private int sizeLength;
+         private int sizeMultiplier;
+         private int flagLength;
+
+         /** Position in the frame data of the next tag's header. */
+         private int offset = 0;
+
+         protected RawTagIterator(
+                 int nameLength, int sizeLength, int sizeMultiplier,
+                 int flagLength) {
+             this.nameLength = nameLength;
+             this.sizeLength = sizeLength;
+             this.sizeMultiplier = sizeMultiplier;
+             this.flagLength = flagLength;
+         }
+
+         /**
+          * There is another tag if a whole tag header fits in the
+          * remaining data, and that data isn't padding.
+          */
+         public boolean hasNext() {
+             // A truncated header at the very end can't be parsed, so
+             // stop there rather than have next() run off the array
+             int headerSize = nameLength + sizeLength + flagLength;
+             if (offset >= data.length || data.length - offset < headerSize) {
+                 return false;
+             }
+             // Check for padding at the end
+             return data[offset] != 0;
+         }
+
+         public RawTag next() {
+             RawTag tag = new RawTag(nameLength, sizeLength, sizeMultiplier,
+                     flagLength, data, offset);
+             offset += tag.getSize();
+             return tag;
+         }
+
+         /**
+          * Not supported, the frame data is read only.
+          *
+          * @throws UnsupportedOperationException always
+          */
+         public void remove() {
+             throw new UnsupportedOperationException("Tags cannot be removed");
+         }
+
+     }
+
+     /**
+      * A single raw tag from within the frame: its name, flags
+      * and a copy of its data bytes.
+      */
+     protected static class RawTag {
+         private int headerSize;
+         protected String name;
+         protected int flag;
+         protected byte[] data;
+
+         private RawTag(
+                 int nameLength, int sizeLength, int sizeMultiplier,
+                 int flagLength, byte[] frameData, int offset) {
+             headerSize = nameLength + sizeLength + flagLength;
+
+             // Name, normally 3 or 4 bytes
+             name = getString(frameData, offset, nameLength);
+
+             // Size
+             int rawSize;
+             if (sizeLength == 3) {
+                 rawSize = getInt3(frameData, offset+nameLength);
+             } else {
+                 rawSize = getInt(frameData, offset+nameLength);
+             }
+             int size = rawSize * sizeMultiplier;
+
+             // Flag
+             if (flagLength > 0) {
+                 if (flagLength == 1) {
+                     flag = (int)frameData[offset+nameLength+sizeLength];
+                 } else {
+                     flag = getInt2(frameData, offset+nameLength+sizeLength);
+                 }
+             }
+
+             // Now data
+             int copyFrom = offset+nameLength+sizeLength+flagLength;
+             size = Math.max(0, Math.min(size, frameData.length-copyFrom)); // TIKA-1218, prevent negative size for malformed files.
+             data = new byte[size];
+             System.arraycopy(frameData, copyFrom, data, 0, size);
+         }
+
+         /** Total bytes taken up by this tag, header plus copied data. */
+         protected int getSize() {
+             return headerSize + data.length;
+         }
+
+     }
+
+ }
Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/LyricsHandler.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import static java.nio.charset.StandardCharsets.US_ASCII;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+/**
+ * This is used to parse Lyrics3 tag information
+ * from an MP3 file, if available.
+ * Handles lyrics tags of up to 10kb in size.
+ * Will process any ID3v1 tag data if present.
+ * Ignores extended ID3v1 data in the lyrics block
+ *
+ * @see <a href="http://www.id3.org/Lyrics3v2">Lyrics3 v2.0 specification</a>
+ */
+ public class LyricsHandler {
+     /** Set once the closing LYRICS200 marker has been seen. */
+     boolean foundLyrics = false;
+     /** Contents of the LYR field, or null if none was read. */
+     String lyricsText = null;
+     /** Handler for the last 128 bytes, or null if there were too few bytes. */
+     ID3v1Handler id3v1 = null;
+
+     /**
+      * Reads the stream through to the end, keeping the last
+      * 10240+128 bytes, and looks for ID3v1 and Lyrics data in them.
+      * The content handler is not used.
+      */
+     public LyricsHandler(InputStream stream, ContentHandler handler)
+             throws IOException, SAXException, TikaException {
+         this(getSuffix(stream, 10240+128));
+     }
+
+     /**
+      * Looks for the Lyrics data, which will be
+      * just before the ID3v1 data (if present),
+      * and process it.
+      * Also sets things up for the ID3v1
+      * processing if required.
+      * Creates from the trailing bytes of a stream; the last 128 of
+      * them are handed to the ID3v1 handler.
+      * If the lyrics block is malformed (bad or out of range lengths),
+      * parsing of it stops, and whatever was read so far is kept.
+      */
+     protected LyricsHandler(byte[] tagData)
+             throws IOException, SAXException, TikaException {
+         if(tagData.length < 128) {
+             return;
+         }
+
+         // Is there ID3v1 data?
+         byte[] last128 = new byte[128];
+         System.arraycopy(tagData, tagData.length-128, last128, 0, 128);
+         id3v1 = new ID3v1Handler(last128);
+
+         if(tagData.length < 137) {
+             return;
+         }
+
+         // Are there lyrics? Look for the closing Lyrics tag
+         // at the end to decide if there is any
+         int lookat = tagData.length - 9;
+         if(id3v1.found) {
+             lookat -= 128;
+         }
+         if(tagData[lookat+0] == 'L' && tagData[lookat+1] == 'Y' &&
+                 tagData[lookat+2] == 'R' && tagData[lookat+3] == 'I' &&
+                 tagData[lookat+4] == 'C' && tagData[lookat+5] == 'S' &&
+                 tagData[lookat+6] == '2' && tagData[lookat+7] == '0' &&
+                 tagData[lookat+8] == '0') {
+             foundLyrics = true;
+
+             // The length (6 bytes) comes just before LYRICS200, and is the
+             // size including the LYRICSBEGIN but excluding the
+             // length+LYRICS200 at the end.
+             if(lookat < 6) {
+                 // No room for the length field
+                 return;
+             }
+             int length;
+             try {
+                 length = Integer.parseInt(
+                         new String(tagData, lookat-6, 6, UTF_8)
+                 );
+             } catch(NumberFormatException e) {
+                 // Length isn't a number, so the block can't be located
+                 return;
+             }
+
+             // Skip over the 11 byte LYRICSBEGIN at the start. The block
+             // ends at lookat-6, so only its start needs checking
+             int lyricsStart = lookat-length+5;
+             int lyricsLength = length-11;
+             if(lyricsLength < 0 || lyricsStart < 0) {
+                 // Claims to be smaller than its own header, or to
+                 // start before the data we have
+                 return;
+             }
+
+             String lyrics = new String(
+                     tagData, lyricsStart, lyricsLength,
+                     US_ASCII
+             );
+
+             // Tags are a 3 letter code, 5 digit length, then data
+             int pos = 0;
+             while(pos < lyrics.length()-8) {
+                 String tagName = lyrics.substring(pos, pos+3);
+                 int tagLen;
+                 try {
+                     tagLen = Integer.parseInt(
+                             lyrics.substring(pos+3, pos+8)
+                     );
+                 } catch(NumberFormatException e) {
+                     break;
+                 }
+                 int startPos = pos + 8;
+                 int endPos = startPos + tagLen;
+
+                 // A negative length could leave pos unchanged (looping
+                 // forever) or move it backwards, and an overlong one
+                 // runs off the end, so give up on the rest
+                 if(tagLen < 0 || endPos > lyrics.length()) {
+                     break;
+                 }
+
+                 if(tagName.equals("LYR")) {
+                     lyricsText = lyrics.substring(startPos, endPos);
+                 }
+
+                 pos = endPos;
+             }
+         }
+     }
+
+     /** Was there enough data for ID3v1, and did its handler find any? */
+     public boolean hasID3v1() {
+         if(id3v1 == null || id3v1.found == false) {
+             return false;
+         }
+         return true;
+     }
+     /** Was a non-empty LYR field read from the lyrics block? */
+     public boolean hasLyrics() {
+         return lyricsText != null && lyricsText.length() > 0;
+     }
+
+     /**
+      * Reads and returns the last <code>length</code> bytes from the
+      * given stream. The stream is read through to its end.
+      * @param stream the <code>InputStream</code> to read from.
+      * @param length number of bytes from the end to read and return
+      * @return the last <code>length</code> bytes of the stream, or all
+      *         of its bytes if the stream holds fewer than that.
+      * @throws IOException if the stream could not be read from.
+      */
+     protected static byte[] getSuffix(InputStream stream, int length)
+             throws IOException {
+         // Double sized buffer, so that the newest "length" bytes only
+         // need shuffling down to the front each time it fills up
+         byte[] buffer = new byte[2 * length];
+         int bytesInBuffer = 0;
+
+         int n = stream.read(buffer);
+         while (n != -1) {
+             bytesInBuffer += n;
+             if (bytesInBuffer == buffer.length) {
+                 System.arraycopy(buffer, bytesInBuffer - length, buffer, 0, length);
+                 bytesInBuffer = length;
+             }
+             n = stream.read(buffer, bytesInBuffer, buffer.length - bytesInBuffer);
+         }
+
+         if (bytesInBuffer < length) {
+             length = bytesInBuffer;
+         }
+
+         byte[] result = new byte[length];
+         System.arraycopy(buffer, bytesInBuffer - length, result, 0, length);
+         return result;
+     }
+ }
Added: tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java?rev=1722029&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-multimedia-module/src/main/java/org/apache/tika/parser/mp3/MP3Frame.java Mon Dec 28 23:22:46 2015
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mp3;
+
+
+/**
+ * A frame in an MP3 file, such as ID3v2 Tags or some
+ * audio.
+ */
+ // Marker interface only: it declares no methods of its own.
+ public interface MP3Frame {
+ }