You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2005/08/12 16:25:38 UTC

svn commit: r232303 - in /lucene/nutch/trunk/src/plugin: ./ parse-rss/ parse-rss/lib/ parse-rss/sample/ parse-rss/src/ parse-rss/src/java/ parse-rss/src/java/org/ parse-rss/src/java/org/apache/ parse-rss/src/java/org/apache/nutch/ parse-rss/src/java/or...

Author: ab
Date: Fri Aug 12 07:23:47 2005
New Revision: 232303

URL: http://svn.apache.org/viewcvs?rev=232303&view=rev
Log:
RSS Parse plugin. Contributed by Chris Mattmann (issue NUTCH-30).
Thank you!

Added:
    lucene/nutch/trunk/src/plugin/parse-rss/
    lucene/nutch/trunk/src/plugin/parse-rss/build.xml   (with props)
    lucene/nutch/trunk/src/plugin/parse-rss/lib/
    lucene/nutch/trunk/src/plugin/parse-rss/lib/commons-feedparser-0.6-fork.jar   (with props)
    lucene/nutch/trunk/src/plugin/parse-rss/lib/commons-httpclient-3.0-beta1.jar   (with props)
    lucene/nutch/trunk/src/plugin/parse-rss/lib/jaxen-core.jar   (with props)
    lucene/nutch/trunk/src/plugin/parse-rss/lib/jaxen-jdom.jar   (with props)
    lucene/nutch/trunk/src/plugin/parse-rss/lib/jdom.jar   (with props)
    lucene/nutch/trunk/src/plugin/parse-rss/lib/log4j-1.2.6.jar   (with props)
    lucene/nutch/trunk/src/plugin/parse-rss/lib/saxpath.jar   (with props)
    lucene/nutch/trunk/src/plugin/parse-rss/lib/xercesImpl.jar   (with props)
    lucene/nutch/trunk/src/plugin/parse-rss/lib/xml-apis.jar   (with props)
    lucene/nutch/trunk/src/plugin/parse-rss/lib/xmlrpc-1.2.jar   (with props)
    lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml   (with props)
    lucene/nutch/trunk/src/plugin/parse-rss/sample/
    lucene/nutch/trunk/src/plugin/parse-rss/sample/rsstest.rss   (with props)
    lucene/nutch/trunk/src/plugin/parse-rss/src/
    lucene/nutch/trunk/src/plugin/parse-rss/src/java/
    lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/
    lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/
    lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/
    lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/
    lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/
    lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/FeedParserListenerImpl.java   (with props)
    lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java   (with props)
    lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/structs/
    lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/structs/RSSChannel.java   (with props)
    lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/structs/RSSItem.java   (with props)
    lucene/nutch/trunk/src/plugin/parse-rss/src/test/
    lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/
    lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/
    lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/
    lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/
    lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/
    lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java   (with props)
Modified:
    lucene/nutch/trunk/src/plugin/build.xml

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=232303&r1=232302&r2=232303&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Fri Aug 12 07:23:47 2005
@@ -14,6 +14,7 @@
      <ant dir="parse-js" target="deploy"/>
      <ant dir="parse-text" target="deploy"/>
      <ant dir="parse-pdf" target="deploy"/>
+     <ant dir="parse-rss" target="deploy"/>
      <ant dir="parse-msword" target="deploy"/>
 <!-- <ant dir="parse-mp3" target="deploy"/> -->
 <!-- <ant dir="parse-rtf" target="deploy"/> -->
@@ -39,6 +40,7 @@
      <ant dir="protocol-http" target="test"/>
      <ant dir="parse-html" target="test"/>
      <ant dir="parse-pdf" target="test"/>
+     <ant dir="parse-rss" target="test"/>
      <ant dir="parse-msword" target="test"/>
  <!-- <ant dir="parse-mp3" target="test"/> -->
  <!-- <ant dir="parse-rtf" target="test"/> -->
@@ -60,6 +62,7 @@
     <ant dir="parse-js" target="clean"/>
     <ant dir="parse-text" target="clean"/>
     <ant dir="parse-pdf" target="clean"/>
+    <ant dir="parse-rss" target="clean"/>
     <ant dir="parse-msword" target="clean"/>
     <ant dir="parse-mp3" target="clean"/>
     <ant dir="parse-rtf" target="clean"/>

Added: lucene/nutch/trunk/src/plugin/parse-rss/build.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/build.xml?rev=232303&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rss/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/parse-rss/build.xml Fri Aug 12 07:23:47 2005
@@ -0,0 +1,13 @@
+<?xml version="1.0"?>
+
+<project name="parse-rss" default="jar">
+
+  <import file="../build-plugin.xml"/>
+  
+  
+  <!-- for junit test: stage the sample feed (sample/rsstest.rss) under
+       ${build.test}/data. These two tasks sit directly under <project>,
+       outside any <target>, so they are not tied to the "test" target.
+       NOTE(review): build.test is presumably defined by the imported
+       ../build-plugin.xml - confirm. -->
+  <mkdir dir="${build.test}/data"/>
+  <copy file="sample/rsstest.rss" todir="${build.test}/data"/>
+  
+
+</project>

Propchange: lucene/nutch/trunk/src/plugin/parse-rss/build.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/parse-rss/lib/commons-feedparser-0.6-fork.jar
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/lib/commons-feedparser-0.6-fork.jar?rev=232303&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/src/plugin/parse-rss/lib/commons-feedparser-0.6-fork.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/src/plugin/parse-rss/lib/commons-httpclient-3.0-beta1.jar
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/lib/commons-httpclient-3.0-beta1.jar?rev=232303&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/src/plugin/parse-rss/lib/commons-httpclient-3.0-beta1.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/src/plugin/parse-rss/lib/jaxen-core.jar
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/lib/jaxen-core.jar?rev=232303&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/src/plugin/parse-rss/lib/jaxen-core.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/src/plugin/parse-rss/lib/jaxen-jdom.jar
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/lib/jaxen-jdom.jar?rev=232303&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/src/plugin/parse-rss/lib/jaxen-jdom.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/src/plugin/parse-rss/lib/jdom.jar
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/lib/jdom.jar?rev=232303&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/src/plugin/parse-rss/lib/jdom.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/src/plugin/parse-rss/lib/log4j-1.2.6.jar
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/lib/log4j-1.2.6.jar?rev=232303&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/src/plugin/parse-rss/lib/log4j-1.2.6.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/src/plugin/parse-rss/lib/saxpath.jar
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/lib/saxpath.jar?rev=232303&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/src/plugin/parse-rss/lib/saxpath.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/src/plugin/parse-rss/lib/xercesImpl.jar
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/lib/xercesImpl.jar?rev=232303&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/src/plugin/parse-rss/lib/xercesImpl.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/src/plugin/parse-rss/lib/xml-apis.jar
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/lib/xml-apis.jar?rev=232303&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/src/plugin/parse-rss/lib/xml-apis.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/src/plugin/parse-rss/lib/xmlrpc-1.2.jar
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/lib/xmlrpc-1.2.jar?rev=232303&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/src/plugin/parse-rss/lib/xmlrpc-1.2.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml?rev=232303&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml (added)
+++ lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml Fri Aug 12 07:23:47 2005
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+   id="parse-rss"
+   name="RSS Parse Plug-in"
+   version="1.0.0"
+   provider-name="edu.usc.cs.cs599">
+
+   <extension-point
+      id="org.apache.nutch.parse.Parser"
+      name="Nutch Content Parser"/>
+
+
+   <!-- Every library name below must match, character for character, a jar
+        shipped in this plugin's lib/ directory (parse-rss.jar is the
+        plugin's own build output). -->
+   <runtime>
+      <library name="parse-rss.jar">
+         <export name="*"/>
+      </library>
+      <library name="jdom.jar"/>
+      <library name="jaxen-core.jar"/>
+      <library name="jaxen-jdom.jar"/>
+      <library name="commons-feedparser-0.6-fork.jar"/>
+      <library name="saxpath.jar"/>
+      <library name="commons-httpclient-3.0-beta1.jar"/>
+      <library name="log4j-1.2.6.jar"/>
+      <library name="xercesImpl.jar"/>
+      <library name="xml-apis.jar"/>
+      <!-- the committed jar is lib/xmlrpc-1.2.jar (no hyphen after "xml") -->
+      <library name="xmlrpc-1.2.jar"/>
+
+   </runtime>
+
+   <!-- Registers RSSParser for application/rss+xml content and for
+        resources whose path ends in "rss". -->
+   <extension id="org.apache.nutch.parse.rss"
+              name="RssParse"
+              point="org.apache.nutch.parse.Parser">
+
+      <implementation id="org.apache.nutch.parse.rss.RSSParser"
+                      class="org.apache.nutch.parse.rss.RSSParser"
+                      contentType="application/rss+xml"
+                      pathSuffix="rss"/>
+
+   </extension>
+
+</plugin>

Propchange: lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/parse-rss/sample/rsstest.rss
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/sample/rsstest.rss?rev=232303&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rss/sample/rsstest.rss (added)
+++ lucene/nutch/trunk/src/plugin/parse-rss/sample/rsstest.rss Fri Aug 12 07:23:47 2005
@@ -0,0 +1,21 @@
+<?xml version="1.0" encoding="ISO-8859-1" ?>
+<rss version="0.91">
+	<channel>
+		<title>TestChannel</title> 
+		<link>http://test.channel.com/</link> 
+		<description>Sample RSS File for Junit test</description> 
+		<language>en-us</language> 
+       
+        <item>
+            <title>Home Page of Chris Mattmann</title>
+            <link>http://www-scf.usc.edu/~mattmann/</link>
+            <description>Chris Mattmann's home page</description>
+        </item>
+
+		<item>
+			<title>Awesome Open Source Search Engine</title> 
+			<link>http://www.nutch.org/</link> 
+			<description>Yup, that's what it is</description> 
+	    </item>
+	</channel>
+</rss>

Propchange: lucene/nutch/trunk/src/plugin/parse-rss/sample/rsstest.rss
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/FeedParserListenerImpl.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/FeedParserListenerImpl.java?rev=232303&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/FeedParserListenerImpl.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/FeedParserListenerImpl.java Fri Aug 12 07:23:47 2005
@@ -0,0 +1,128 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.rss;
+
+import org.apache.commons.feedparser.DefaultFeedParserListener;
+import org.apache.commons.feedparser.FeedParserState;
+import org.apache.commons.feedparser.FeedParserException;
+
+import java.util.List;
+import java.util.Vector;
+import java.util.Date;
+
+import org.apache.nutch.parse.rss.structs.RSSChannel;
+import org.apache.nutch.parse.rss.structs.RSSItem;
+
+/**
+ * 
+ * @author mattmann
+ * @version 1.0
+ * 
+ * <p>
+ * Feed parser listener class which builds up an RSS Channel model that can be
+ * iterated through to retrieve the parsed information.
+ * </p>
+ */
+public class FeedParserListenerImpl extends DefaultFeedParserListener {
+
+    // channels that have been closed off because a later channel started
+    private List fRssChannels = null;
+
+    // channel currently receiving items; null until the first onChannel call
+    private RSSChannel fCurrentChannel = null;
+
+    /**
+     * <p>
+     * Default Constructor
+     * </p>
+     */
+    public FeedParserListenerImpl() {
+        fRssChannels = new Vector();
+    }
+
+    /**
+     * <p>
+     * Gets a {@link List}of {@link RSSChannel}s that the listener parsed from
+     * the RSS document, in the order they were encountered.
+     * </p>
+     * 
+     * <p>
+     * The result is a fresh list on every call: the channels already closed
+     * off by a later channel, followed by the channel still being filled (if
+     * any). The list never contains <code>null</code> and is empty when no
+     * channel was seen, so the method can be called repeatedly without
+     * changing the listener's state.
+     * </p>
+     * 
+     * @return A {@link List}of {@link RSSChannel}s, possibly empty, never
+     *         <code>null</code>.
+     */
+    public List getChannels() {
+        List channels = new Vector(fRssChannels);
+
+        // the last channel of the document is never followed by another
+        // onChannel callback, so it is still held in fCurrentChannel
+        if (fCurrentChannel != null) {
+            channels.add(fCurrentChannel);
+        }
+
+        return channels;
+    }
+
+    /**
+     * <p>
+     * Callback method when the parser encounters an RSS Channel. The
+     * previously current channel, if any, is stored and a new current channel
+     * is started.
+     * </p>
+     * 
+     * @param state
+     *            The current state of the FeedParser.
+     * @param title
+     *            The title of the RSS Channel.
+     * @param link
+     *            A hyperlink to the RSS Channel.
+     * @param description
+     *            The description of the RSS Channel.
+     */
+    public void onChannel(FeedParserState state, String title, String link,
+            String description) throws FeedParserException {
+
+        //capture the old channel if it's not null
+        if (fCurrentChannel != null) {
+            fRssChannels.add(fCurrentChannel);
+        }
+
+        // RSSChannel's constructor takes (description, link, title), which
+        // is the reverse of this callback's (title, link, description)
+        fCurrentChannel = new RSSChannel(description, link, title);
+
+    }
+
+    /**
+     * <p>
+     * Callback method when the parser encounters an RSS Item. The item is
+     * appended to the current channel; it is ignored if no channel has been
+     * seen yet.
+     * </p>
+     * 
+     * @param state
+     *            The current state of the FeedParser.
+     * @param title
+     *            The title of the RSS Item.
+     * @param link
+     *            A hyperlink to the RSS Item.
+     * @param description
+     *            The description of the RSS Item.
+     * @param permalink
+     *            A permanent link to the RSS Item.
+     */
+    public void onItem(FeedParserState state, String title, String link,
+            String description, String permalink) throws FeedParserException {
+
+        if (fCurrentChannel != null) { //should never be null
+            fCurrentChannel.getItems().add(
+                    new RSSItem(title, link, description, permalink));
+        }
+
+    }
+
+}

Propchange: lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/FeedParserListenerImpl.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java?rev=232303&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java Fri Aug 12 07:23:47 2005
@@ -0,0 +1,228 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.rss;
+
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseException;
+
+import org.apache.nutch.parse.rss.structs.RSSItem;
+import org.apache.nutch.parse.rss.structs.RSSChannel;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.io.IOException;
+
+import java.net.MalformedURLException;
+
+import java.util.Properties;
+import java.util.logging.Logger;
+import java.util.Date;
+import java.util.List;
+import java.util.Vector;
+
+// add all the RSS parsing imports right here
+import org.apache.commons.feedparser.FeedParserState;
+import org.apache.commons.feedparser.FeedParserListener;
+import org.apache.commons.feedparser.FeedParser;
+import org.apache.commons.feedparser.FeedParserException;
+import org.apache.commons.feedparser.FeedParserFactory;
+import org.apache.commons.feedparser.DefaultFeedParserListener;
+import org.apache.commons.feedparser.network.ResourceRequestFactory;
+import org.apache.commons.feedparser.network.ResourceRequest;
+
+/**
+ * 
+ * @author mattmann
+ * @version 1.0
+ * 
+ * <p>
+ * RSS Parser class for nutch
+ * </p>
+ */
+public class RSSParser implements Parser {
+    public static final Logger LOG = LogFormatter
+            .getLogger("org.apache.nutch.parse.rss");
+
+    /**
+     * <p>
+     * Default Constructor. Routes log4j's root logger into this plugin's
+     * java.util.logging logger at INFO level.
+     * </p>
+     */
+    public RSSParser() {
+
+        // redirect org.apache.log4j.Logger to java's native logger, in order
+        // to, at least, suppress annoying log4j warnings.
+        // Note on 20040614 by Xing:
+        // log4j is used by pdfbox. This snippet'd better be moved
+        // to a common place shared by all parsers that use log4j.
+        // NOTE(review): an appender is added to the log4j root logger on
+        // every construction - confirm that only one instance is created.
+        org.apache.log4j.Logger rootLogger = org.apache.log4j.Logger
+                .getRootLogger();
+
+        rootLogger.setLevel(org.apache.log4j.Level.INFO);
+
+        org.apache.log4j.Appender appender = new org.apache.log4j.WriterAppender(
+                new org.apache.log4j.SimpleLayout(),
+                org.apache.nutch.util.LogFormatter.getLogStream(LOG,
+                        java.util.logging.Level.INFO));
+
+        rootLogger.addAppender(appender);
+    }
+
+    /**
+     * <p>
+     * Implementation method, parses the RSS content, and then returns a
+     * {@link ParseImpl}.
+     * </p>
+     * 
+     * <p>
+     * The parse title is the concatenation of the channel titles, the parse
+     * text is the concatenation of the channel and item descriptions, and
+     * the outlinks are the channel links plus, for every item, its permalink
+     * (or its link when there is no permalink). Missing titles and
+     * descriptions are skipped, and a malformed link only drops that one
+     * outlink.
+     * </p>
+     * 
+     * @param content
+     *            The content to parse (hopefully an RSS content stream)
+     * @return A {@link ParseImpl}which implements the {@link Parse}interface,
+     *         or an empty parse with a failure status when the content type
+     *         is not text/xml or application/rss+xml, or when the feed
+     *         parser throws.
+     */
+    public Parse getParse(Content content) {
+
+        // check that contentType is one we can handle
+        String contentType = content.getContentType();
+        if (contentType != null
+                && (!contentType.startsWith("text/xml") && !contentType
+                        .startsWith("application/rss+xml")))
+            return new ParseStatus(ParseStatus.FAILED_INVALID_FORMAT,
+                    "Content-Type not text/xml or application/rss+xml: "
+                            + contentType).getEmptyParse();
+
+        List theRSSChannels = null;
+
+        try {
+            byte[] raw = content.getContent();
+
+            // create a new FeedParser...
+            FeedParser parser = FeedParserFactory.newFeedParser();
+
+            // create a listener for handling our callbacks
+            FeedParserListener listener = new FeedParserListenerImpl();
+
+            // start parsing our feed and have the onItem methods called
+            parser.parse(listener, new ByteArrayInputStream(raw), /* resource */
+            null);
+
+            theRSSChannels = ((FeedParserListenerImpl) listener).getChannels();
+
+        } catch (Exception e) { // run time exception
+            // keep the stack trace in the log instead of on stderr
+            LOG.log(java.util.logging.Level.FINE,
+                    "nutch:parse-rss:RSSParser Exception: " + e.getMessage(),
+                    e);
+            return new ParseStatus(ParseStatus.FAILED,
+                    "Can't be handled as rss document. " + e).getEmptyParse();
+        }
+
+        StringBuffer contentTitle = new StringBuffer(), indexText = new StringBuffer();
+        List theOutlinks = new Vector();
+
+        // for us, the contentTitle will be a concatenation of the titles of the
+        // RSS Channels that we've parsed
+        // and the index text will be a concatenation of the RSS Channel
+        // descriptions, and descriptions of the RSS Items in the channel
+
+        // also get the outlinks
+
+        if (theRSSChannels != null) {
+            for (int i = 0; i < theRSSChannels.size(); i++) {
+                RSSChannel r = (RSSChannel) theRSSChannels.get(i);
+                if (r == null) {
+                    // defensive: a listener may report "no channel" as null
+                    continue;
+                }
+
+                appendText(contentTitle, r.getTitle());
+                appendText(indexText, r.getDescription());
+
+                // a bad channel link must not stop the channel's items from
+                // being processed
+                addOutlink(theOutlinks, r.getLink(), r.getDescription());
+
+                // now get the descriptions of all the underlying RSS Items and
+                // then index them too
+                List items = r.getItems();
+                if (items == null) {
+                    continue;
+                }
+
+                for (int j = 0; j < items.size(); j++) {
+                    RSSItem theRSSItem = (RSSItem) items.get(j);
+                    appendText(indexText, theRSSItem.getDescription());
+
+                    // prefer the permalink, fall back to the plain link
+                    String whichLink = theRSSItem.getPermalink();
+                    if (whichLink == null) {
+                        whichLink = theRSSItem.getLink();
+                    }
+
+                    addOutlink(theOutlinks, whichLink, theRSSItem
+                            .getDescription());
+                }
+
+            }
+
+            LOG.fine("nutch:parse-rss:getParse:indexText=" + indexText);
+            LOG.fine("nutch:parse-rss:getParse:contentTitle=" + contentTitle);
+
+        } else {
+            LOG
+                    .fine("nutch:parse-rss:Error:getParse: No RSS Channels recorded!");
+        }
+
+        // format the outlinks
+
+        Outlink[] outlinks = (Outlink[]) theOutlinks
+                .toArray(new Outlink[theOutlinks.size()]);
+
+        LOG.fine("nutch:parse-rss:getParse:found " + outlinks.length
+                + " outlinks");
+
+        ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
+                contentTitle.toString(), outlinks, content.getMetadata());
+        return new ParseImpl(indexText.toString(), parseData);
+
+    }
+
+    /**
+     * Appends <code>text</code> followed by a single space to
+     * <code>buffer</code>; does nothing when <code>text</code> is
+     * <code>null</code>, so the literal "null" never ends up in the output.
+     */
+    private static void appendText(StringBuffer buffer, String text) {
+        if (text != null) {
+            buffer.append(text);
+            buffer.append(" ");
+        }
+    }
+
+    /**
+     * Adds an {@link Outlink}for <code>url</code> with the given anchor text
+     * to <code>outlinks</code>. A <code>null</code> url is ignored; a
+     * malformed url is logged and skipped.
+     */
+    private static void addOutlink(List outlinks, String url, String anchor) {
+        if (url == null) {
+            return;
+        }
+
+        try {
+            outlinks.add(new Outlink(url, anchor));
+        } catch (MalformedURLException e) {
+            LOG.log(java.util.logging.Level.INFO,
+                    "nutch:parse-rss:RSSParser Exception: MalformedURL: "
+                            + url
+                            + ": Attempting to continue processing outlinks",
+                    e);
+        }
+    }
+
+}

Propchange: lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/structs/RSSChannel.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/structs/RSSChannel.java?rev=232303&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/structs/RSSChannel.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/structs/RSSChannel.java Fri Aug 12 07:23:47 2005
@@ -0,0 +1,188 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.rss.structs;
+
+import java.util.List;
+import java.util.Vector;
+
+/**
+ * 
+ * <p>
+ * Data class for holding RSS Channels to send to Nutch's indexer
+ * </p>
+ * 
+ * @author mattmann
+ * @version 1.0
+ */
+public class RSSChannel {
+
+    //description of the channel
+    private String fDescription = null;
+
+    // link to the channel's page
+    private String fLink = null;
+
+    // title of the Channel
+    private String fTitle = null;
+
+    // set of items in the Channel; never null
+    private List fItems = null;
+
+    /**
+     * 
+     * <p>
+     * Full constructor. Note the parameter order: description first, title
+     * last.
+     * </p>
+     * 
+     * @param desc
+     *            The description of the channel.
+     * @param link
+     *            A link to the channel's url.
+     * @param title
+     *            The title of the channel.
+     * @param items
+     *            A list of {@link RSSItem}s for this RSS Channel; a
+     *            <code>null</code> list is treated as an empty one.
+     */
+    public RSSChannel(String desc, String link, String title, List items) {
+        fDescription = desc;
+        fLink = link;
+        fTitle = title;
+        // callers iterate over and add to getItems() without a null check
+        fItems = (items != null) ? items : new Vector();
+
+    }
+
+    /**
+     * 
+     * <p>
+     * Constructor if you don't have the list of RSS Items ready yet; the
+     * channel starts with an empty, modifiable item list. Note the parameter
+     * order: description first, title last.
+     * </p>
+     * 
+     * @param desc
+     *            The description of the channel.
+     * @param link
+     *            A link to the channel's url.
+     * @param title
+     *            The title of the channel.
+     */
+    public RSSChannel(String desc, String link, String title) {
+        this(desc, link, title, new Vector());
+    }
+
+    /**
+     * 
+     * <p>
+     * Get the list of items for this channel. The returned list is the
+     * channel's own list, so adding to it adds items to the channel.
+     * </p>
+     * 
+     * @return A list of {@link RSSItem}s, never <code>null</code>.
+     */
+    public List getItems() {
+        return fItems;
+    }
+
+    /**
+     * 
+     * <p>
+     * Returns the channel title
+     * </p>
+     * 
+     * @return The title of the channel.
+     */
+
+    public String getTitle() {
+        return fTitle;
+    }
+
+    /**
+     * 
+     * <p>
+     * Returns a link to the RSS Channel.
+     * </p>
+     * 
+     * @return A {@link String}link to the RSS Channel.
+     */
+    public String getLink() {
+        return fLink;
+    }
+
+    /**
+     * 
+     * <p>
+     * Returns a {@link String}description of the RSS Channel.
+     * </p>
+     * 
+     * @return The description of the RSS Channel.
+     */
+    public String getDescription() {
+        return fDescription;
+    }
+
+    /**
+     * 
+     * <p>
+     * Sets the list of RSS items for this channel.
+     * </p>
+     * 
+     * @param items
+     *            A List of {@link RSSItem}s for this RSSChannel; a
+     *            <code>null</code> list is treated as an empty one.
+     */
+    public void setItems(List items) {
+        fItems = (items != null) ? items : new Vector();
+    }
+
+    /**
+     * 
+     * <p>
+     * Sets the Title for this RSS Channel.
+     * </p>
+     * 
+     * @param title
+     *            The title of this RSSChannel.
+     */
+    public void setTitle(String title) {
+        fTitle = title;
+    }
+
+    /**
+     * 
+     * <p>
+     * Sets the link to this RSSChannel
+     * </p>
+     * 
+     * @param link
+     *            A {@link String}representation of a link to this RSS Channel.
+     */
+    public void setLink(String link) {
+        fLink = link;
+    }
+
+    /**
+     * 
+     * <p>
+     * Sets the description of this RSSChannel
+     * </p>
+     * 
+     * @param description
+     *            A String description of this RSS Channel.
+     */
+    public void setDescription(String description) {
+        fDescription = description;
+    }
+}

Propchange: lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/structs/RSSChannel.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/structs/RSSItem.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/structs/RSSItem.java?rev=232303&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/structs/RSSItem.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/structs/RSSItem.java Fri Aug 12 07:23:47 2005
@@ -0,0 +1,150 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.rss.structs;
+
+/**
+ * 
+ * <p>
+ * Data class for holding RSS Items to send to Nutch's indexer
+ * </p>
+ * 
+ * @author mattmann
+ * @version 1.0
+ */
+public class RSSItem {
+    //The title of this RSS Item
+    private String fTitle = null;
+
+    //The link that this RSS Item points to
+    private String fLink = null;
+
+    //The description of this RSS Item
+    private String fDescription = null;
+
+    //A permanent link that this RSS Item points to
+    private String fPermalink = null;
+
+    /**
+     * <p>
+     * Creates an RSS Item with all four of its fields populated.
+     * </p>
+     *
+     * @param title The title of this RSS Item.
+     * @param link The link that this RSS Item points to.
+     * @param description The description of this RSS Item.
+     * @param permalink The permanent link that this RSS Item points to.
+     */
+    public RSSItem(String title, String link, String description,
+            String permalink) {
+        fTitle = title;
+        fLink = link;
+        fDescription = description;
+        fPermalink = permalink;
+    }
+
+    /**
+     * <p>
+     * Gets the title of this RSS Item.
+     * </p>
+     *
+     * @return The title of this RSS Item.
+     */
+    public String getTitle() {
+        return fTitle;
+    }
+
+    /**
+     * <p>
+     * Gets the link that this RSS Item points to.
+     * </p>
+     *
+     * @return The link that this RSS Item points to.
+     */
+    public String getLink() {
+        return fLink;
+    }
+
+    /**
+     * <p>
+     * Gets the description of this RSS Item.
+     * </p>
+     *
+     * @return The description of this RSS Item.
+     */
+    public String getDescription() {
+        return fDescription;
+    }
+
+    /**
+     * <p>
+     * If this RSS Item points to a permanent link, then this method returns it.
+     * </p>
+     *
+     * @return The permanent link that this RSS Item points to.
+     */
+    public String getPermalink() {
+        return fPermalink;
+    }
+
+    /**
+     * <p>
+     * Sets the title of this RSS Item.
+     * </p>
+     *
+     * @param title
+     *            The title of this RSS Item.
+     */
+    public void setTitle(String title) {
+        fTitle = title;
+    }
+
+    /**
+     * <p>
+     * Sets the link that this RSS Item points to. The title is left untouched.
+     * </p>
+     *
+     * @param link
+     *            The link that this RSS Item points to.
+     */
+    public void setLink(String link) {
+        fLink = link;
+    }
+
+    /**
+     * <p>
+     * Sets the description of this RSS Item.
+     * </p>
+     *
+     * @param description
+     *            The description of this RSS Item.
+     */
+    public void setDescription(String description) {
+        fDescription = description;
+    }
+
+    /**
+     * <p>
+     * Sets the permanent link that this RSS Item points to.
+     * </p>
+     *
+     * @param permalink
+     *            The permanent link that this RSS Item points to.
+     */
+    public void setPermalink(String permalink) {
+        fPermalink = permalink;
+    }
+}

Propchange: lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/structs/RSSItem.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java?rev=232303&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java Fri Aug 12 07:23:47 2005
@@ -0,0 +1,128 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.rss;
+
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+
+import org.apache.nutch.parse.ParserFactory;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.Outlink;
+
+import junit.framework.TestCase;
+
+/**
+ * Unit tests for the RSS Parser based on John Xing's TestPdfParser class.
+ * 
+ * @author mattmann
+ * @version 1.0
+ */
+public class TestRSSParser extends TestCase {
+
+    private String fileSeparator = System.getProperty("file.separator");
+
+    // This system property is defined in ./src/plugin/build-plugin.xml
+    private String sampleDir = System.getProperty("test.data", ".");
+
+    // Make sure sample files are copied to "test.data" as specified in
+    // ./src/plugin/parse-rss/build.xml during plugin compilation.
+
+    private String[] sampleFiles = { "rsstest.rss" };
+
+    /**
+     * <p>
+     * Creates the test case with the given name.
+     * </p>
+     * 
+     * @param name
+     *            The name of the RSSParserTest
+     */
+    public TestRSSParser(String name) {
+        super(name);
+    }
+
+    /**
+     * <p>
+     * The test method: for each sample file, tests out the following:
+     * </p>
+     * 
+     * <ul>
+     * <li>There are 3 outlinks read from the sample rss file</li>
+     * <li>The 3 outlinks read are in fact the correct outlinks from the sample
+     * file</li>
+     * </ul>
+     */
+    public void testIt() throws ProtocolException, ParseException {
+        for (int i = 0; i < sampleFiles.length; i++) {
+            String urlString = "file:" + sampleDir + fileSeparator
+                    + sampleFiles[i];
+
+            Protocol protocol = ProtocolFactory.getProtocol(urlString);
+            Content content = protocol.getProtocolOutput(urlString)
+                    .getContent();
+
+            Parser parser = ParserFactory.getParser(content.getContentType(),
+                    urlString);
+            Parse parse = parser.getParse(content);
+
+            //check that there are 3 outlinks, compared below exactly as:
+            //http://test.channel.com/
+            //http://www-scf.usc.edu/~mattmann/
+            //http://www.nutch.org/
+
+            ParseData theParseData = parse.getData();
+            Outlink[] theOutlinks = theParseData.getOutlinks();
+
+            //assertEquals reports the actual count when the check fails
+            assertEquals("There aren't 3 outlinks read!", 3,
+                    theOutlinks.length);
+
+            //now check to make sure that those are the three outlinks
+            boolean hasLink1 = false, hasLink2 = false, hasLink3 = false;
+
+            for (int j = 0; j < theOutlinks.length; j++) {
+                String toUrl = theOutlinks[j].getToUrl();
+
+                if (toUrl.equals("http://www-scf.usc.edu/~mattmann/")) {
+                    hasLink1 = true;
+                }
+
+                if (toUrl.equals("http://www.nutch.org/")) {
+                    hasLink2 = true;
+                }
+
+                if (toUrl.equals("http://test.channel.com/")) {
+                    hasLink3 = true;
+                }
+            }
+
+            //assert each link on its own so a failure names the missing one
+            String wrong =
+                    "Outlinks read from sample rss file are not correct!";
+            assertTrue(wrong + " Missing http://www-scf.usc.edu/~mattmann/",
+                    hasLink1);
+            assertTrue(wrong + " Missing http://www.nutch.org/", hasLink2);
+            assertTrue(wrong + " Missing http://test.channel.com/", hasLink3);
+        }
+    }
+
+}

Propchange: lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java
------------------------------------------------------------------------------
    svn:eol-style = native