You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by je...@apache.org on 2006/03/29 10:58:05 UTC
svn commit: r389712 -
/lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java
Author: jerome
Date: Wed Mar 29 00:58:02 2006
New Revision: 389712
URL: http://svn.apache.org/viewcvs?rev=389712&view=rev
Log:
main method added to RSSParser
Modified:
lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java
Modified: lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java?rev=389712&r1=389711&r2=389712&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java Wed Mar 29 00:58:02 2006
@@ -16,32 +16,40 @@
package org.apache.nutch.parse.rss;
-import org.apache.nutch.protocol.Content;
+// JDK imports
+import java.io.ByteArrayInputStream;
+import java.net.MalformedURLException;
+import java.util.logging.Logger;
+import java.util.List;
+import java.util.Vector;
+import java.util.logging.Level;
+
+// Hadoop imports
+import org.apache.hadoop.io.UTF8;
import org.apache.hadoop.util.LogFormatter;
import org.apache.hadoop.conf.Configuration;
+
+// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.Outlink;
-
import org.apache.nutch.parse.rss.structs.RSSItem;
import org.apache.nutch.parse.rss.structs.RSSChannel;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.util.NutchConfiguration;
-import java.io.ByteArrayInputStream;
-
-import java.net.MalformedURLException;
-
-import java.util.logging.Logger;
-import java.util.List;
-import java.util.Vector;
-
-// add all the RSS parsing imports right here
+// RSS parsing imports
import org.apache.commons.feedparser.FeedParserListener;
import org.apache.commons.feedparser.FeedParser;
import org.apache.commons.feedparser.FeedParserFactory;
+
/**
*
* @author mattmann
@@ -214,5 +222,19 @@
public Configuration getConf() { // accessor: hands back the Configuration held in this instance's conf field
return this.conf;
}
+
+ public static void main(String[] args) throws Exception { // command-line entry point: fetch the URL in args[0], parse it with RSSParser, print the result
+ LOG.setLevel(Level.FINE); // set the logger level to FINE for this run (LOG is declared outside this hunk)
+ String url = args[0]; // NOTE(review): no argument-count check; running with no arguments throws ArrayIndexOutOfBoundsException
+ Configuration conf = NutchConfiguration.create(); // configuration object shared by the parser and the protocol factory below
+ RSSParser parser = new RSSParser();
+ parser.setConf(conf); // the parser is configured explicitly here before use
+ Protocol protocol = new ProtocolFactory(conf).getProtocol(url); // ask the factory for the Protocol to use for this URL
+ Content content = protocol.getProtocolOutput(new UTF8(url), new CrawlDatum()).getContent(); // retrieve the Content for the URL, passing a freshly constructed CrawlDatum
+ Parse parse = parser.getParse(content); // run the RSS parser over the retrieved content
+ System.out.println("data: "+ parse.getData()); // print the parse data to stdout
+ System.out.println("text: "+parse.getText()); // print the parse text to stdout
+ }
+
}