You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/02/06 19:51:11 UTC
svn commit: r741674 - in /lucene/tika/trunk: pom.xml
src/main/java/org/apache/tika/gui/ParsingTransferHandler.java
src/main/java/org/apache/tika/parser/audio/AudioParser.java
src/main/java/org/apache/tika/parser/audio/MidiParser.java
Author: jukka
Date: Fri Feb 6 18:51:10 2009
New Revision: 741674
URL: http://svn.apache.org/viewvc?rev=741674&view=rev
Log:
TIKA-199: Improved audio detection and parsing
Streamlined the audio and MIDI parsers. Both now wrap the input in a BufferedInputStream, because the native javax.sound parsers expect the given input stream to support the mark feature.
Modified:
lucene/tika/trunk/pom.xml
lucene/tika/trunk/src/main/java/org/apache/tika/gui/ParsingTransferHandler.java
lucene/tika/trunk/src/main/java/org/apache/tika/parser/audio/AudioParser.java
lucene/tika/trunk/src/main/java/org/apache/tika/parser/audio/MidiParser.java
Modified: lucene/tika/trunk/pom.xml
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/pom.xml?rev=741674&r1=741673&r2=741674&view=diff
==============================================================================
--- lucene/tika/trunk/pom.xml (original)
+++ lucene/tika/trunk/pom.xml Fri Feb 6 18:51:10 2009
@@ -193,7 +193,7 @@
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
- <version>1.4</version>
+ <version>1.5-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>pdfbox</groupId>
Modified: lucene/tika/trunk/src/main/java/org/apache/tika/gui/ParsingTransferHandler.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/gui/ParsingTransferHandler.java?rev=741674&r1=741673&r2=741674&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/gui/ParsingTransferHandler.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/gui/ParsingTransferHandler.java Fri Feb 6 18:51:10 2009
@@ -54,7 +54,8 @@
public boolean canImport(JComponent component, DataFlavor[] flavors) {
for (DataFlavor flavor : flavors) {
- if (flavor.equals(DataFlavor.javaFileListFlavor) || flavor.equals(uriListFlavor)) {
+ if (flavor.equals(DataFlavor.javaFileListFlavor)
+ || flavor.equals(uriListFlavor)) {
return true;
}
}
Modified: lucene/tika/trunk/src/main/java/org/apache/tika/parser/audio/AudioParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/audio/AudioParser.java?rev=741674&r1=741673&r2=741674&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/audio/AudioParser.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/audio/AudioParser.java Fri Feb 6 18:51:10 2009
@@ -16,11 +16,11 @@
*/
package org.apache.tika.parser.audio;
+import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Map.Entry;
-import javax.sound.sampled.AudioFileFormat;
import javax.sound.sampled.AudioFormat;
import javax.sound.sampled.AudioSystem;
import javax.sound.sampled.UnsupportedAudioFileException;
@@ -34,52 +34,55 @@
public class AudioParser implements Parser {
- public void parse(InputStream stream, ContentHandler handler,
- Metadata metadata) throws IOException, SAXException, TikaException {
- parse(stream, metadata);
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata)
+ throws IOException, SAXException, TikaException {
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
- xhtml.endDocument();
- }
- public void parse(InputStream stream, Metadata metadata)
- throws IOException, TikaException {
- String type = metadata.get(Metadata.CONTENT_TYPE);
- if (type != null) {
- try {
-
- AudioFileFormat fileFormat = AudioSystem
- .getAudioFileFormat(stream);
-
- AudioFormat format = fileFormat.getFormat();
-
- metadata.set("samplerate", Integer.toString((int) format
- .getSampleRate()));
- metadata
- .set("channels", Integer.toString(format.getChannels()));
- metadata.set("bits", Integer.toString(format
- .getSampleSizeInBits()));
- metadata.set("encoding", format.getEncoding().toString());
-
- // Javadoc suggests that some of the following properties might
- // be available, but I had no success in finding any:
-
- // "duration" Long playback duration of the file in microseconds
- // "author" String name of the author of this file
- // "title" String title of this file
- // "copyright" String copyright message
- // "date" Date date of the recording or release
- // "comment" String an arbitrary text
-
- for (Entry<String, Object> entry : format.properties()
- .entrySet()) {
- metadata.set(entry.getKey(), entry.getValue().toString());
- }
+ // AudioSystem expects the stream to support the mark feature
+ InputStream buffered = new BufferedInputStream(stream);
+ try {
+ AudioFormat format =
+ AudioSystem.getAudioFileFormat(buffered).getFormat();
+
+ float rate = format.getSampleRate();
+ if (rate != AudioSystem.NOT_SPECIFIED) {
+ metadata.set("samplerate", String.valueOf(rate));
+ }
+
+ int channels = format.getChannels();
+ if (channels != AudioSystem.NOT_SPECIFIED) {
+ metadata.set("channels", String.valueOf(channels));
+ }
- } catch (UnsupportedAudioFileException e) {
- // cannot parse, unknown format
+ int bits = format.getSampleSizeInBits();
+ if (bits != AudioSystem.NOT_SPECIFIED) {
+ metadata.set("bits", String.valueOf(bits));
}
+ metadata.set("encoding", format.getEncoding().toString());
+
+ // Javadoc suggests that some of the following properties might
+ // be available, but I had no success in finding any:
+
+ // "duration" Long playback duration of the file in microseconds
+ // "author" String name of the author of this file
+ // "title" String title of this file
+ // "copyright" String copyright message
+ // "date" Date date of the recording or release
+ // "comment" String an arbitrary text
+
+ for (Entry<String, Object> entry : format.properties().entrySet()) {
+ metadata.set(entry.getKey(), entry.getValue().toString());
+ }
+ } catch (UnsupportedAudioFileException e) {
+ // There is no way to know whether this exception was
+ // caused by the document being corrupted or by the format
+ // just being unsupported. So we do nothing.
}
+
+ xhtml.endDocument();
}
+
}
Modified: lucene/tika/trunk/src/main/java/org/apache/tika/parser/audio/MidiParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/audio/MidiParser.java?rev=741674&r1=741673&r2=741674&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/audio/MidiParser.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/audio/MidiParser.java Fri Feb 6 18:51:10 2009
@@ -16,13 +16,15 @@
*/
package org.apache.tika.parser.audio;
+import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
-import java.util.HashMap;
import javax.sound.midi.InvalidMidiDataException;
import javax.sound.midi.MidiSystem;
+import javax.sound.midi.Patch;
import javax.sound.midi.Sequence;
+import javax.sound.midi.Track;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
@@ -33,46 +35,44 @@
public class MidiParser implements Parser {
- public void parse(InputStream stream, ContentHandler handler,
- Metadata metadata) throws IOException, SAXException, TikaException {
- parse(stream, metadata);
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata)
+ throws IOException, SAXException, TikaException {
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
- xhtml.endDocument();
- }
-
- private static HashMap<Float, String> divisionTypes = new HashMap<Float, String>();
-
- static {
- divisionTypes.put(Sequence.PPQ, "PRQ");
- divisionTypes.put(Sequence.SMPTE_24, "SMPTE_24");
- divisionTypes.put(Sequence.SMPTE_25, "SMPTE_25");
- divisionTypes.put(Sequence.SMPTE_30, "SMPTE_30");
- divisionTypes.put(Sequence.SMPTE_30DROP, "SMPTE_30DROP");
- }
-
- public void parse(InputStream stream, Metadata metadata)
- throws IOException, TikaException {
- String type = metadata.get(Metadata.CONTENT_TYPE);
- if (type != null) {
-
- try {
-
- Sequence sequence = MidiSystem.getSequence(stream);
- metadata.set("tracks", Integer
- .toString(sequence.getTracks().length));
-
- metadata.set("patches", Integer.toString(sequence
- .getPatchList().length));
-
- metadata.set("divisionType", divisionTypes.get(sequence
- .getDivisionType()));
-
- } catch (InvalidMidiDataException e) {
- // cannot parse format
+ // MidiSystem expects the stream to support the mark feature
+ InputStream buffered = new BufferedInputStream(stream);
+ try {
+ Sequence sequence = MidiSystem.getSequence(buffered);
+
+ Track[] tracks = sequence.getTracks();
+ metadata.set("tracks", String.valueOf(tracks.length));
+
+ Patch[] patches = sequence.getPatchList();
+ metadata.set("patches", String.valueOf(patches.length));
+
+ float type = sequence.getDivisionType();
+ if (type == Sequence.PPQ) {
+ metadata.set("divisionType", "PPQ");
+ } else if (type == Sequence.SMPTE_24) {
+ metadata.set("divisionType", "SMPTE_24");
+ } else if (type == Sequence.SMPTE_25) {
+ metadata.set("divisionType", "SMPTE_25");
+ } else if (type == Sequence.SMPTE_30) {
+ metadata.set("divisionType", "SMPTE_30");
+ } else if (type == Sequence.SMPTE_30DROP) {
+ metadata.set("divisionType", "SMPTE_30DROP");
+ } else { // unrecognized division type: record its raw numeric value
+ metadata.set("divisionType", String.valueOf(type));
}
-
+ } catch (InvalidMidiDataException ignore) {
+ // There is no way to know whether this exception was
+ // caused by the document being corrupted or by the format
+ // just being unsupported. So we do nothing.
}
+
+ xhtml.endDocument();
}
+
}