You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by je...@apache.org on 2006/02/18 01:23:41 UTC
svn commit: r378667 - in /lucene/nutch/trunk/src/plugin/parse-mp3/src:
java/org/apache/nutch/parse/mp3/MP3Parser.java
java/org/apache/nutch/parse/mp3/MetadataCollector.java
test/org/apache/nutch/parse/mp3/TestMP3Parser.java
Author: jerome
Date: Fri Feb 17 16:23:35 2006
New Revision: 378667
URL: http://svn.apache.org/viewcvs?rev=378667&view=rev
Log:
Adapts parse-mp3 to Nutch API changes (metadata, parse, protocol, ...)
Modified:
lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java
lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MetadataCollector.java
lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java
Modified: lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java?rev=378667&r1=378666&r2=378667&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MP3Parser.java Fri Feb 17 16:23:35 2006
@@ -16,10 +16,14 @@
package org.apache.nutch.parse.mp3;
+// JDK imports
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.util.Iterator;
-import org.apache.nutch.parse.*;
-import org.apache.nutch.protocol.Content;
-import org.apache.hadoop.conf.Configuration;
+// Java ID3 Tag imports
import org.farng.mp3.MP3File;
import org.farng.mp3.TagException;
import org.farng.mp3.id3.AbstractID3v2;
@@ -27,29 +31,35 @@
import org.farng.mp3.id3.ID3v1;
import org.farng.mp3.object.AbstractMP3Object;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.net.MalformedURLException;
-import java.util.Iterator;
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Nutch imports
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.protocol.Content;
+
/**
* A parser for MP3 audio files
* @author Andy Hedges
*/
-
public class MP3Parser implements Parser {
private MetadataCollector metadataCollector;
private Configuration conf;
- public Parse getParse(Content content) throws ParseException {
- Parse parse = null;
- metadataCollector.putAll(content.getMetadata());
+ public Parse getParse(Content content) {
+ Parse parse = null;
byte[] raw = content.getContent();
-
File tmp = null;
+
try {
tmp = File.createTempFile("nutch", ".mp3");
FileOutputStream fos = new FileOutputStream(tmp);
@@ -58,25 +68,31 @@
MP3File mp3 = new MP3File(tmp);
if (mp3.hasID3v2Tag()) {
- parse = getID3v2Parse(mp3);
+ parse = getID3v2Parse(mp3, content.getMetadata());
} else if (mp3.hasID3v1Tag()) {
- parse = getID3v1Parse(mp3);
+ parse = getID3v1Parse(mp3, content.getMetadata());
} else {
- throw new ParseException("No textual content available");
+ return new ParseStatus(ParseStatus.FAILED,
+ ParseStatus.FAILED_MISSING_CONTENT,
+ "No textual content available").getEmptyParse(conf);
}
-
-
} catch (IOException e) {
- throw new ParseException("Couldn't create temporary file", e);
+ return new ParseStatus(ParseStatus.FAILED,
+ ParseStatus.FAILED_EXCEPTION,
+ "Couldn't create temporary file:" + e).getEmptyParse(conf);
} catch (TagException e) {
- throw new ParseException("ID3 Tags could not be parsed", e);
+ return new ParseStatus(ParseStatus.FAILED,
+ ParseStatus.FAILED_EXCEPTION,
+ "ID3 Tags could not be parsed:" + e).getEmptyParse(conf);
} finally{
tmp.delete();
}
return parse;
}
- private Parse getID3v1Parse(MP3File mp3) throws MalformedURLException {
+ private Parse getID3v1Parse(MP3File mp3, Metadata contentMeta)
+ throws MalformedURLException {
+
ID3v1 tag = mp3.getID3v1Tag();
metadataCollector.notifyProperty("TALB-Text", tag.getAlbum());
metadataCollector.notifyProperty("TPE1-Text", tag.getArtist());
@@ -84,13 +100,17 @@
metadataCollector.notifyProperty("TCON-Text", "(" + tag.getGenre() + ")");
metadataCollector.notifyProperty("TIT2-Text", tag.getTitle());
metadataCollector.notifyProperty("TYER-Text", tag.getYear());
- ParseData parseData = new ParseData(metadataCollector.getTitle(),
- metadataCollector.getOutlinks(),
- metadataCollector.getData(), getConf());
+ ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
+ metadataCollector.getTitle(),
+ metadataCollector.getOutlinks(),
+ contentMeta,
+ metadataCollector.getData());
return new ParseImpl(metadataCollector.getText(), parseData);
}
- public Parse getID3v2Parse(MP3File mp3) throws IOException {
+ public Parse getID3v2Parse(MP3File mp3, Metadata contentMeta)
+ throws IOException {
+
AbstractID3v2 tag = mp3.getID3v2Tag();
Iterator it = tag.iterator();
while (it.hasNext()) {
@@ -108,9 +128,11 @@
}
}
}
- ParseData parseData = new ParseData(metadataCollector.getTitle(),
- metadataCollector.getOutlinks(),
- metadataCollector.getData());
+ ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
+ metadataCollector.getTitle(),
+ metadataCollector.getOutlinks(),
+ contentMeta,
+ metadataCollector.getData());
return new ParseImpl(metadataCollector.getText(), parseData);
}
Modified: lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MetadataCollector.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MetadataCollector.java?rev=378667&r1=378666&r2=378667&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MetadataCollector.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-mp3/src/java/org/apache/nutch/parse/mp3/MetadataCollector.java Fri Feb 17 16:23:35 2006
@@ -21,7 +21,7 @@
import java.net.MalformedURLException;
import java.util.ArrayList;
-import java.util.Properties;
+import org.apache.nutch.metadata.Metadata;
/**
* This class allows meta data to be collected and manipulated
@@ -29,7 +29,7 @@
*/
public class MetadataCollector {
- private Properties metadata = new Properties();
+ private Metadata metadata = new Metadata();
private String title = null;
private String artist = null;
private String album = null;
@@ -55,14 +55,10 @@
text += value + "\n";
}
- metadata.setProperty(name, value);
+ metadata.set(name, value);
}
- public void putAll(Properties properties) {
- metadata.putAll(properties);
- }
-
- public Properties getData() {
+ public Metadata getData() {
return metadata;
}
Modified: lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java?rev=378667&r1=378666&r2=378667&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java Fri Feb 17 16:23:35 2006
@@ -27,6 +27,12 @@
import org.apache.nutch.protocol.ProtocolFactory;
import java.util.Properties;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.UTF8;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.util.NutchConfiguration;
/**
* Unit tests for TestMP3Parser. (Adapted from John Xing msword unit tests).
@@ -62,22 +68,23 @@
Content content;
Parse parse;
+ Configuration conf = NutchConfiguration.create();
urlString = "file:" + sampleDir + fileSeparator + id3v2;
- protocol = ProtocolFactory.getProtocol(urlString);
- content = protocol.getContent(urlString);
-
- parse = ParseUtil.parseByParserId("parse-mp3",content);
- Properties metadata = parse.getData().getMetadata();
- assertEquals("postgresql comment id3v2", metadata.getProperty("COMM-Text"));
- assertEquals("postgresql composer id3v2", metadata.getProperty("TCOM-Text"));
- assertEquals("02", metadata.getProperty("TRCK-Text"));
- assertEquals("http://localhost/", metadata.getProperty("WCOP-URL Link"));
- assertEquals("postgresql artist id3v2", metadata.getProperty("TPE1-Text"));
- assertEquals("(28)", metadata.getProperty("TCON-Text"));
- assertEquals("2004", metadata.getProperty("TYER-Text"));
- assertEquals("postgresql title id3v2", metadata.getProperty("TIT2-Text"));
- assertEquals("postgresql album id3v2", metadata.getProperty("TALB-Text"));
- assertEquals("postgresql encoded by id3v2", metadata.getProperty("TENC-Text"));
+ protocol = new ProtocolFactory(conf).getProtocol(urlString);
+ content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum())
+ .getContent();
+ parse = new ParseUtil(conf).parseByParserId("parse-mp3", content);
+ Metadata metadata = parse.getData().getParseMeta();
+ assertEquals("postgresql comment id3v2", metadata.get("COMM-Text"));
+ assertEquals("postgresql composer id3v2", metadata.get("TCOM-Text"));
+ assertEquals("02", metadata.get("TRCK-Text"));
+ assertEquals("http://localhost/", metadata.get("WCOP-URL Link"));
+ assertEquals("postgresql artist id3v2", metadata.get("TPE1-Text"));
+ assertEquals("(28)", metadata.get("TCON-Text"));
+ assertEquals("2004", metadata.get("TYER-Text"));
+ assertEquals("postgresql title id3v2", metadata.get("TIT2-Text"));
+ assertEquals("postgresql album id3v2", metadata.get("TALB-Text"));
+ assertEquals("postgresql encoded by id3v2", metadata.get("TENC-Text"));
assertEquals("postgresql title id3v2 - "
+ "postgresql album id3v2 - "
@@ -91,22 +98,22 @@
String urlString;
Protocol protocol;
Content content;
- Parser parser;
Parse parse;
+ Configuration conf = NutchConfiguration.create();
urlString = "file:" + sampleDir + fileSeparator + id3v1;
- protocol = ProtocolFactory.getProtocol(urlString);
- content = protocol.getContent(urlString);
- parser = ParserFactory.getParser(content.getContentType(), urlString);
- parse = parser.getParse(content);
-
- Properties metadata = parse.getData().getMetadata();
- assertEquals("postgresql comment id3v1", metadata.getProperty("COMM-Text"));
- assertEquals("postgresql artist id3v1", metadata.getProperty("TPE1-Text"));
- assertEquals("(28)", metadata.getProperty("TCON-Text"));
- assertEquals("2004", metadata.getProperty("TYER-Text"));
- assertEquals("postgresql title id3v1", metadata.getProperty("TIT2-Text"));
- assertEquals("postgresql album id3v1", metadata.getProperty("TALB-Text"));
+ protocol = new ProtocolFactory(conf).getProtocol(urlString);
+ content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum())
+ .getContent();
+ parse = new ParseUtil(conf).parseByParserId("parse-mp3", content);
+
+ Metadata metadata = parse.getData().getParseMeta();
+ assertEquals("postgresql comment id3v1", metadata.get("COMM-Text"));
+ assertEquals("postgresql artist id3v1", metadata.get("TPE1-Text"));
+ assertEquals("(28)", metadata.get("TCON-Text"));
+ assertEquals("2004", metadata.get("TYER-Text"));
+ assertEquals("postgresql title id3v1", metadata.get("TIT2-Text"));
+ assertEquals("postgresql album id3v1", metadata.get("TALB-Text"));
assertEquals("postgresql title id3v1 - "
+ "postgresql album id3v1 - "
@@ -118,21 +125,18 @@
String urlString;
Protocol protocol;
Content content;
- Parser parser;
Parse parse;
+ Configuration conf = NutchConfiguration.create();
urlString = "file:" + sampleDir + fileSeparator + none;
- protocol = ProtocolFactory.getProtocol(urlString);
- content = protocol.getContent(urlString);
- parser = ParserFactory.getParser(content.getContentType(), urlString);
- try {
- parse = parser.getParse(content);
- Properties metadata = parse.getData().getMetadata();
- } catch (ParseException e) {
- return;
+ protocol = new ProtocolFactory(conf).getProtocol(urlString);
+ content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum())
+ .getContent();
+ parse = new ParseUtil(conf).parseByParserId("parse-mp3", content);
+ Metadata metadata = parse.getData().getParseMeta();
+ if (parse.getData().getStatus().isSuccess()) {
+ fail("Expected ParseException");
}
- fail("Expected ParseException");
-
}
}