You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by je...@apache.org on 2006/02/18 01:23:41 UTC

svn commit: r378667 - in /lucene/nutch/trunk/src/plugin/parse-mp3/src: java/org/apache/nutch/parse/mp3/MP3Parser.java java/org/apache/nutch/parse/mp3/MetadataCollector.java test/org/apache/nutch/parse/mp3/TestMP3Parser.java

Author: jerome
Date: Fri Feb 17 16:23:35 2006
New Revision: 378667

URL: http://svn.apache.org/viewcvs?rev=378667&view=rev
Log:
Adapts parse-mp3 to Nutch API changes (metadata, parse, protocol, ...)

Modified:
    lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java
    lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MetadataCollector.java
    lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java

Modified: lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java?rev=378667&r1=378666&r2=378667&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java Fri Feb 17 16:23:35 2006
@@ -16,10 +16,14 @@
 
 package org.apache.nutch.parse.mp3;
 
+// JDK imports
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.util.Iterator;
 
-import org.apache.nutch.parse.*;
-import org.apache.nutch.protocol.Content;
-import org.apache.hadoop.conf.Configuration;
+// Java ID3 Tag imports
 import org.farng.mp3.MP3File;
 import org.farng.mp3.TagException;
 import org.farng.mp3.id3.AbstractID3v2;
@@ -27,29 +31,35 @@
 import org.farng.mp3.id3.ID3v1;
 import org.farng.mp3.object.AbstractMP3Object;
 
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.net.MalformedURLException;
-import java.util.Iterator;
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Nutch imports
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.protocol.Content;
+
 
 /**
  * A parser for MP3 audio files
  * @author Andy Hedges
  */
-
 public class MP3Parser implements Parser {
 
   private MetadataCollector metadataCollector;
   private Configuration conf;
 
-  public Parse getParse(Content content) throws ParseException {
-    Parse parse = null;
-    metadataCollector.putAll(content.getMetadata());
+  public Parse getParse(Content content) {
 
+    Parse parse = null;
     byte[] raw = content.getContent();
-
     File tmp = null;
+    
     try {
       tmp = File.createTempFile("nutch", ".mp3");
       FileOutputStream fos = new FileOutputStream(tmp);
@@ -58,25 +68,31 @@
       MP3File mp3 = new MP3File(tmp);
 
       if (mp3.hasID3v2Tag()) {
-        parse = getID3v2Parse(mp3);
+        parse = getID3v2Parse(mp3, content.getMetadata());
       } else if (mp3.hasID3v1Tag()) {
-        parse = getID3v1Parse(mp3);
+        parse = getID3v1Parse(mp3, content.getMetadata());
       } else {
-        throw new ParseException("No textual content available");
+        return new ParseStatus(ParseStatus.FAILED,
+                               ParseStatus.FAILED_MISSING_CONTENT,
+                               "No textual content available").getEmptyParse(conf);
       }
-
-
     } catch (IOException e) {
-      throw new ParseException("Couldn't create temporary file", e);
+      return new ParseStatus(ParseStatus.FAILED,
+                             ParseStatus.FAILED_EXCEPTION,
+                             "Couldn't create temporary file:" + e).getEmptyParse(conf);
     } catch (TagException e) {
-      throw new ParseException("ID3 Tags could not be parsed", e);
+      return new ParseStatus(ParseStatus.FAILED,
+                             ParseStatus.FAILED_EXCEPTION,
+                             "ID3 Tags could not be parsed:" + e).getEmptyParse(conf);
     } finally{
       tmp.delete();
     }
     return parse;
   }
 
-  private Parse getID3v1Parse(MP3File mp3) throws MalformedURLException {
+  private Parse getID3v1Parse(MP3File mp3, Metadata contentMeta)
+  throws MalformedURLException {
+
     ID3v1 tag = mp3.getID3v1Tag();
     metadataCollector.notifyProperty("TALB-Text", tag.getAlbum());
     metadataCollector.notifyProperty("TPE1-Text", tag.getArtist());
@@ -84,13 +100,17 @@
     metadataCollector.notifyProperty("TCON-Text", "(" + tag.getGenre() + ")");
     metadataCollector.notifyProperty("TIT2-Text", tag.getTitle());
     metadataCollector.notifyProperty("TYER-Text", tag.getYear());
-    ParseData parseData = new ParseData(metadataCollector.getTitle(),
-        metadataCollector.getOutlinks(),
-        metadataCollector.getData(), getConf());
+    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
+                                        metadataCollector.getTitle(),
+                                        metadataCollector.getOutlinks(),
+                                        contentMeta,
+                                        metadataCollector.getData());
     return new ParseImpl(metadataCollector.getText(), parseData);
   }
 
-  public Parse getID3v2Parse(MP3File mp3) throws IOException {
+  public Parse getID3v2Parse(MP3File mp3, Metadata contentMeta)
+  throws IOException {
+    
     AbstractID3v2 tag = mp3.getID3v2Tag();
     Iterator it = tag.iterator();
     while (it.hasNext()) {
@@ -108,9 +128,11 @@
         }
       }
     }
-    ParseData parseData = new ParseData(metadataCollector.getTitle(),
-        metadataCollector.getOutlinks(),
-        metadataCollector.getData());
+    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
+                                        metadataCollector.getTitle(),
+                                        metadataCollector.getOutlinks(),
+                                        contentMeta,
+                                        metadataCollector.getData());
     return new ParseImpl(metadataCollector.getText(), parseData);
   }
 

Modified: lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MetadataCollector.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MetadataCollector.java?rev=378667&r1=378666&r2=378667&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MetadataCollector.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MetadataCollector.java Fri Feb 17 16:23:35 2006
@@ -21,7 +21,7 @@
 
 import java.net.MalformedURLException;
 import java.util.ArrayList;
-import java.util.Properties;
+import org.apache.nutch.metadata.Metadata;
 
 /**
  * This class allows meta data to be collected and manipulated
@@ -29,7 +29,7 @@
  */
 public class MetadataCollector {
 
-  private Properties metadata = new Properties();
+  private Metadata metadata = new Metadata();
   private String title = null;
   private String artist = null;
   private String album = null;
@@ -55,14 +55,10 @@
       text += value + "\n";
     }
 
-    metadata.setProperty(name, value);
+    metadata.set(name, value);
   }
 
-  public void putAll(Properties properties) {
-    metadata.putAll(properties);
-  }
-
-  public Properties getData() {
+  public Metadata getData() {
     return metadata;
   }
 

Modified: lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java?rev=378667&r1=378666&r2=378667&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java Fri Feb 17 16:23:35 2006
@@ -27,6 +27,12 @@
 import org.apache.nutch.protocol.ProtocolFactory;
 
 import java.util.Properties;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.UTF8;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.util.NutchConfiguration;
 
 /**
  * Unit tests for TestMP3Parser.  (Adapted from John Xing msword unit tests).
@@ -62,22 +68,23 @@
     Content content;
     Parse parse;
 
+    Configuration conf = NutchConfiguration.create();
     urlString = "file:" + sampleDir + fileSeparator + id3v2;
-    protocol = ProtocolFactory.getProtocol(urlString);
-    content = protocol.getContent(urlString);
-
-    parse = ParseUtil.parseByParserId("parse-mp3",content);
-    Properties metadata = parse.getData().getMetadata();
-    assertEquals("postgresql comment id3v2", metadata.getProperty("COMM-Text"));
-    assertEquals("postgresql composer id3v2", metadata.getProperty("TCOM-Text"));
-    assertEquals("02", metadata.getProperty("TRCK-Text"));
-    assertEquals("http://localhost/", metadata.getProperty("WCOP-URL Link"));
-    assertEquals("postgresql artist id3v2", metadata.getProperty("TPE1-Text"));
-    assertEquals("(28)", metadata.getProperty("TCON-Text"));
-    assertEquals("2004", metadata.getProperty("TYER-Text"));
-    assertEquals("postgresql title id3v2", metadata.getProperty("TIT2-Text"));
-    assertEquals("postgresql album id3v2", metadata.getProperty("TALB-Text"));
-    assertEquals("postgresql encoded by id3v2", metadata.getProperty("TENC-Text"));
+    protocol = new ProtocolFactory(conf).getProtocol(urlString);
+    content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum())
+                      .getContent();
+    parse = new ParseUtil(conf).parseByParserId("parse-mp3", content);
+    Metadata metadata = parse.getData().getParseMeta();
+    assertEquals("postgresql comment id3v2", metadata.get("COMM-Text"));
+    assertEquals("postgresql composer id3v2", metadata.get("TCOM-Text"));
+    assertEquals("02", metadata.get("TRCK-Text"));
+    assertEquals("http://localhost/", metadata.get("WCOP-URL Link"));
+    assertEquals("postgresql artist id3v2", metadata.get("TPE1-Text"));
+    assertEquals("(28)", metadata.get("TCON-Text"));
+    assertEquals("2004", metadata.get("TYER-Text"));
+    assertEquals("postgresql title id3v2", metadata.get("TIT2-Text"));
+    assertEquals("postgresql album id3v2", metadata.get("TALB-Text"));
+    assertEquals("postgresql encoded by id3v2", metadata.get("TENC-Text"));
 
     assertEquals("postgresql title id3v2 - "
         + "postgresql album id3v2 - "
@@ -91,22 +98,22 @@
     String urlString;
     Protocol protocol;
     Content content;
-    Parser parser;
     Parse parse;
 
+    Configuration conf = NutchConfiguration.create();
     urlString = "file:" + sampleDir + fileSeparator + id3v1;
-    protocol = ProtocolFactory.getProtocol(urlString);
-    content = protocol.getContent(urlString);
-    parser = ParserFactory.getParser(content.getContentType(), urlString);
-    parse = parser.getParse(content);
-
-    Properties metadata = parse.getData().getMetadata();
-    assertEquals("postgresql comment id3v1", metadata.getProperty("COMM-Text"));
-    assertEquals("postgresql artist id3v1", metadata.getProperty("TPE1-Text"));
-    assertEquals("(28)", metadata.getProperty("TCON-Text"));
-    assertEquals("2004", metadata.getProperty("TYER-Text"));
-    assertEquals("postgresql title id3v1", metadata.getProperty("TIT2-Text"));
-    assertEquals("postgresql album id3v1", metadata.getProperty("TALB-Text"));
+    protocol = new ProtocolFactory(conf).getProtocol(urlString);
+    content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum())
+                      .getContent();
+    parse = new ParseUtil(conf).parseByParserId("parse-mp3", content);
+
+    Metadata metadata = parse.getData().getParseMeta();
+    assertEquals("postgresql comment id3v1", metadata.get("COMM-Text"));
+    assertEquals("postgresql artist id3v1", metadata.get("TPE1-Text"));
+    assertEquals("(28)", metadata.get("TCON-Text"));
+    assertEquals("2004", metadata.get("TYER-Text"));
+    assertEquals("postgresql title id3v1", metadata.get("TIT2-Text"));
+    assertEquals("postgresql album id3v1", metadata.get("TALB-Text"));
 
     assertEquals("postgresql title id3v1 - "
         + "postgresql album id3v1 - "
@@ -118,21 +125,18 @@
     String urlString;
     Protocol protocol;
     Content content;
-    Parser parser;
     Parse parse;
 
+    Configuration conf = NutchConfiguration.create();
     urlString = "file:" + sampleDir + fileSeparator + none;
-    protocol = ProtocolFactory.getProtocol(urlString);
-    content = protocol.getContent(urlString);
-    parser = ParserFactory.getParser(content.getContentType(), urlString);
-    try {
-      parse = parser.getParse(content);
-      Properties metadata = parse.getData().getMetadata();
-    } catch (ParseException e) {
-      return;
+    protocol = new ProtocolFactory(conf).getProtocol(urlString);
+    content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum())
+                      .getContent();
+    parse = new ParseUtil(conf).parseByParserId("parse-mp3", content);
+    Metadata metadata = parse.getData().getParseMeta();
+    if (parse.getData().getStatus().isSuccess()) {
+      fail("Expected ParseException");
     }
-    fail("Expected ParseException");
-
   }
 
 }