You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by je...@apache.org on 2006/03/29 10:58:05 UTC

svn commit: r389712 - /lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java

Author: jerome
Date: Wed Mar 29 00:58:02 2006
New Revision: 389712

URL: http://svn.apache.org/viewcvs?rev=389712&view=rev
Log:
main method added to RSSParser

Modified:
    lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java

Modified: lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java?rev=389712&r1=389711&r2=389712&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java Wed Mar 29 00:58:02 2006
@@ -16,32 +16,40 @@
 
 package org.apache.nutch.parse.rss;
 
-import org.apache.nutch.protocol.Content;
+// JDK imports
+import java.io.ByteArrayInputStream;
+import java.net.MalformedURLException;
+import java.util.logging.Logger;
+import java.util.List;
+import java.util.Vector;
+import java.util.logging.Level;
+
+// Hadoop imports
+import org.apache.hadoop.io.UTF8;
 import org.apache.hadoop.util.LogFormatter;
 import org.apache.hadoop.conf.Configuration;
+
+// Nutch imports
+import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.parse.Parser;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseStatus;
 import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.parse.ParseImpl;
 import org.apache.nutch.parse.Outlink;
-
 import org.apache.nutch.parse.rss.structs.RSSItem;
 import org.apache.nutch.parse.rss.structs.RSSChannel;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.util.NutchConfiguration;
 
-import java.io.ByteArrayInputStream;
-
-import java.net.MalformedURLException;
-
-import java.util.logging.Logger;
-import java.util.List;
-import java.util.Vector;
-
-// add all the RSS parsing imports right here
+// RSS parsing imports
 import org.apache.commons.feedparser.FeedParserListener;
 import org.apache.commons.feedparser.FeedParser;
 import org.apache.commons.feedparser.FeedParserFactory;
 
+
 /**
  * 
  * @author mattmann
@@ -214,5 +222,19 @@
   public Configuration getConf() {
     return this.conf;
   }
+  
+  public static void main(String[] args) throws Exception { // command-line entry point: takes a URL in args[0], runs it through RSSParser and prints the parse result
+    LOG.setLevel(Level.FINE); // set the logger level to FINE for this run (LOG is declared outside this hunk)
+    String url = args[0]; // NOTE(review): no length check -- throws ArrayIndexOutOfBoundsException when run without arguments
+    Configuration conf = NutchConfiguration.create();
+    RSSParser parser = new RSSParser();
+    parser.setConf(conf);
+    Protocol protocol = new ProtocolFactory(conf).getProtocol(url); // obtain the Protocol for this URL from the factory
+    Content content = protocol.getProtocolOutput(new UTF8(url), new CrawlDatum()).getContent(); // retrieve the Content for the URL, passing a fresh default-constructed CrawlDatum
+    Parse parse = parser.getParse(content);
+    System.out.println("data: "+ parse.getData());
+    System.out.println("text: "+parse.getText());
+  }
+  
 
 }