You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/02/06 19:51:11 UTC

svn commit: r741674 - in /lucene/tika/trunk: pom.xml src/main/java/org/apache/tika/gui/ParsingTransferHandler.java src/main/java/org/apache/tika/parser/audio/AudioParser.java src/main/java/org/apache/tika/parser/audio/MidiParser.java

Author: jukka
Date: Fri Feb  6 18:51:10 2009
New Revision: 741674

URL: http://svn.apache.org/viewvc?rev=741674&view=rev
Log:
TIKA-199: Improved audio detection and parsing

Streamlined the audio and MIDI parsers. Both now wrap the input in a BufferedInputStream, because the native javax.sound parsers expect the given input stream to support the mark feature.

Modified:
    lucene/tika/trunk/pom.xml
    lucene/tika/trunk/src/main/java/org/apache/tika/gui/ParsingTransferHandler.java
    lucene/tika/trunk/src/main/java/org/apache/tika/parser/audio/AudioParser.java
    lucene/tika/trunk/src/main/java/org/apache/tika/parser/audio/MidiParser.java

Modified: lucene/tika/trunk/pom.xml
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/pom.xml?rev=741674&r1=741673&r2=741674&view=diff
==============================================================================
--- lucene/tika/trunk/pom.xml (original)
+++ lucene/tika/trunk/pom.xml Fri Feb  6 18:51:10 2009
@@ -193,7 +193,7 @@
     <dependency>
       <groupId>commons-io</groupId>
       <artifactId>commons-io</artifactId>
-      <version>1.4</version>
+      <version>1.5-SNAPSHOT</version>
     </dependency>
     <dependency>
       <groupId>pdfbox</groupId>

Modified: lucene/tika/trunk/src/main/java/org/apache/tika/gui/ParsingTransferHandler.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/gui/ParsingTransferHandler.java?rev=741674&r1=741673&r2=741674&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/gui/ParsingTransferHandler.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/gui/ParsingTransferHandler.java Fri Feb  6 18:51:10 2009
@@ -54,7 +54,8 @@
 
     public boolean canImport(JComponent component, DataFlavor[] flavors) {
         for (DataFlavor flavor : flavors) {
-            if (flavor.equals(DataFlavor.javaFileListFlavor) || flavor.equals(uriListFlavor)) {
+            if (flavor.equals(DataFlavor.javaFileListFlavor)
+                    || flavor.equals(uriListFlavor)) {
                 return true;
             }
         }

Modified: lucene/tika/trunk/src/main/java/org/apache/tika/parser/audio/AudioParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/audio/AudioParser.java?rev=741674&r1=741673&r2=741674&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/audio/AudioParser.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/audio/AudioParser.java Fri Feb  6 18:51:10 2009
@@ -16,11 +16,11 @@
  */
 package org.apache.tika.parser.audio;
 
+import java.io.BufferedInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.Map.Entry;
 
-import javax.sound.sampled.AudioFileFormat;
 import javax.sound.sampled.AudioFormat;
 import javax.sound.sampled.AudioSystem;
 import javax.sound.sampled.UnsupportedAudioFileException;
@@ -34,52 +34,55 @@
 
 public class AudioParser implements Parser {
 
-    public void parse(InputStream stream, ContentHandler handler,
-            Metadata metadata) throws IOException, SAXException, TikaException {
-        parse(stream, metadata);
+    public void parse(
+            InputStream stream, ContentHandler handler, Metadata metadata)
+            throws IOException, SAXException, TikaException {
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         xhtml.startDocument();
-        xhtml.endDocument();
-    }
 
-    public void parse(InputStream stream, Metadata metadata)
-            throws IOException, TikaException {
-        String type = metadata.get(Metadata.CONTENT_TYPE);
-        if (type != null) {
-            try {
-
-                AudioFileFormat fileFormat = AudioSystem
-                        .getAudioFileFormat(stream);
-
-                AudioFormat format = fileFormat.getFormat();
-
-                metadata.set("samplerate", Integer.toString((int) format
-                        .getSampleRate()));
-                metadata
-                        .set("channels", Integer.toString(format.getChannels()));
-                metadata.set("bits", Integer.toString(format
-                        .getSampleSizeInBits()));
-                metadata.set("encoding", format.getEncoding().toString());
-
-                // Javadoc suggests that some of the following properties might
-                // be available, but I had no success in finding any:
-
-                // "duration" Long playback duration of the file in microseconds
-                // "author" String name of the author of this file
-                // "title" String title of this file
-                // "copyright" String copyright message
-                // "date" Date date of the recording or release
-                // "comment" String an arbitrary text
-
-                for (Entry<String, Object> entry : format.properties()
-                        .entrySet()) {
-                    metadata.set(entry.getKey(), entry.getValue().toString());
-                }
+        // AudioSystem expects the stream to support the mark feature
+        InputStream buffered = new BufferedInputStream(stream);
+        try {
+            AudioFormat format =
+                AudioSystem.getAudioFileFormat(buffered).getFormat();
+
+            float rate = format.getSampleRate();
+            if (rate != AudioSystem.NOT_SPECIFIED) {
+                metadata.set("samplerate", String.valueOf(rate));
+            }
+
+            int channels = format.getChannels();
+            if (channels != AudioSystem.NOT_SPECIFIED) {
+                metadata.set("channels", String.valueOf(channels));
+            }
 
-            } catch (UnsupportedAudioFileException e) {
-                // cannot parse, unknown format
+            int bits = format.getSampleSizeInBits();
+            if (bits != AudioSystem.NOT_SPECIFIED) {
+                metadata.set("bits", String.valueOf(bits));
             }
 
+            metadata.set("encoding", format.getEncoding().toString());
+
+            // Javadoc suggests that some of the following properties might
+            // be available, but I had no success in finding any:
+
+            // "duration" Long playback duration of the file in microseconds
+            // "author" String name of the author of this file
+            // "title" String title of this file
+            // "copyright" String copyright message
+            // "date" Date date of the recording or release
+            // "comment" String an arbitrary text
+
+            for (Entry<String, Object> entry : format.properties().entrySet()) {
+                metadata.set(entry.getKey(), entry.getValue().toString());
+            }
+        } catch (UnsupportedAudioFileException e) {
+            // There is no way to know whether this exception was
+            // caused by the document being corrupted or by the format
+            // just being unsupported. So we do nothing.
         }
+
+        xhtml.endDocument();
     }
+
 }

Modified: lucene/tika/trunk/src/main/java/org/apache/tika/parser/audio/MidiParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/src/main/java/org/apache/tika/parser/audio/MidiParser.java?rev=741674&r1=741673&r2=741674&view=diff
==============================================================================
--- lucene/tika/trunk/src/main/java/org/apache/tika/parser/audio/MidiParser.java (original)
+++ lucene/tika/trunk/src/main/java/org/apache/tika/parser/audio/MidiParser.java Fri Feb  6 18:51:10 2009
@@ -16,13 +16,15 @@
  */
 package org.apache.tika.parser.audio;
 
+import java.io.BufferedInputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.util.HashMap;
 
 import javax.sound.midi.InvalidMidiDataException;
 import javax.sound.midi.MidiSystem;
+import javax.sound.midi.Patch;
 import javax.sound.midi.Sequence;
+import javax.sound.midi.Track;
 
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
@@ -33,46 +35,44 @@
 
 public class MidiParser implements Parser {
 
-    public void parse(InputStream stream, ContentHandler handler,
-            Metadata metadata) throws IOException, SAXException, TikaException {
-        parse(stream, metadata);
+    public void parse(
+            InputStream stream, ContentHandler handler, Metadata metadata)
+            throws IOException, SAXException, TikaException {
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         xhtml.startDocument();
-        xhtml.endDocument();
-    }
-
-    private static HashMap<Float, String> divisionTypes = new HashMap<Float, String>();
-
-    static {
-        divisionTypes.put(Sequence.PPQ, "PRQ");
-        divisionTypes.put(Sequence.SMPTE_24, "SMPTE_24");
-        divisionTypes.put(Sequence.SMPTE_25, "SMPTE_25");
-        divisionTypes.put(Sequence.SMPTE_30, "SMPTE_30");
-        divisionTypes.put(Sequence.SMPTE_30DROP, "SMPTE_30DROP");
-    }
-
-    public void parse(InputStream stream, Metadata metadata)
-            throws IOException, TikaException {
-        String type = metadata.get(Metadata.CONTENT_TYPE);
-        if (type != null) {
-
-            try {
-
-                Sequence sequence = MidiSystem.getSequence(stream);
 
-                metadata.set("tracks", Integer
-                        .toString(sequence.getTracks().length));
-
-                metadata.set("patches", Integer.toString(sequence
-                        .getPatchList().length));
-
-                metadata.set("divisionType", divisionTypes.get(sequence
-                        .getDivisionType()));
-
-            } catch (InvalidMidiDataException e) {
-                // cannot parse format
+        // MidiSystem expects the stream to support the mark feature
+        InputStream buffered = new BufferedInputStream(stream);
+        try {
+            Sequence sequence = MidiSystem.getSequence(buffered);
+
+            Track[] tracks = sequence.getTracks();
+            metadata.set("tracks", String.valueOf(tracks.length));
+
+            Patch[] patches = sequence.getPatchList();
+            metadata.set("patches", String.valueOf(patches.length));
+
+            float type = sequence.getDivisionType();
+            if (type == Sequence.PPQ) {
+                metadata.set("divisionType", "PPQ");
+            } else if (type == Sequence.SMPTE_24) {
+                metadata.set("divisionType", "SMPTE_24");
+            } else if (type == Sequence.SMPTE_25) {
+                metadata.set("divisionType", "SMPTE_25");
+            } else if (type == Sequence.SMPTE_30) {
+                metadata.set("divisionType", "SMPTE_30");
+            } else if (type == Sequence.SMPTE_30DROP) {
+                metadata.set("divisionType", "SMPTE_30DROP");
+            } else { // unrecognized division type: record the raw value
+                metadata.set("divisionType", String.valueOf(type));
             }
-
+        } catch (InvalidMidiDataException ignore) {
+            // There is no way to know whether this exception was
+            // caused by the document being corrupted or by the format
+            // just being unsupported. So we do nothing.
         }
+
+        xhtml.endDocument();
     }
+
 }