You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2005/08/12 16:25:38 UTC
svn commit: r232303 - in /lucene/nutch/trunk/src/plugin: ./ parse-rss/
parse-rss/lib/ parse-rss/sample/ parse-rss/src/ parse-rss/src/java/
parse-rss/src/java/org/ parse-rss/src/java/org/apache/
parse-rss/src/java/org/apache/nutch/ parse-rss/src/java/or...
Author: ab
Date: Fri Aug 12 07:23:47 2005
New Revision: 232303
URL: http://svn.apache.org/viewcvs?rev=232303&view=rev
Log:
RSS Parse plugin. Contributed by Chris Mattmann (issue NUTCH-30).
Thank you!
Added:
lucene/nutch/trunk/src/plugin/parse-rss/
lucene/nutch/trunk/src/plugin/parse-rss/build.xml (with props)
lucene/nutch/trunk/src/plugin/parse-rss/lib/
lucene/nutch/trunk/src/plugin/parse-rss/lib/commons-feedparser-0.6-fork.jar (with props)
lucene/nutch/trunk/src/plugin/parse-rss/lib/commons-httpclient-3.0-beta1.jar (with props)
lucene/nutch/trunk/src/plugin/parse-rss/lib/jaxen-core.jar (with props)
lucene/nutch/trunk/src/plugin/parse-rss/lib/jaxen-jdom.jar (with props)
lucene/nutch/trunk/src/plugin/parse-rss/lib/jdom.jar (with props)
lucene/nutch/trunk/src/plugin/parse-rss/lib/log4j-1.2.6.jar (with props)
lucene/nutch/trunk/src/plugin/parse-rss/lib/saxpath.jar (with props)
lucene/nutch/trunk/src/plugin/parse-rss/lib/xercesImpl.jar (with props)
lucene/nutch/trunk/src/plugin/parse-rss/lib/xml-apis.jar (with props)
lucene/nutch/trunk/src/plugin/parse-rss/lib/xmlrpc-1.2.jar (with props)
lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml (with props)
lucene/nutch/trunk/src/plugin/parse-rss/sample/
lucene/nutch/trunk/src/plugin/parse-rss/sample/rsstest.rss (with props)
lucene/nutch/trunk/src/plugin/parse-rss/src/
lucene/nutch/trunk/src/plugin/parse-rss/src/java/
lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/
lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/
lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/
lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/
lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/
lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/FeedParserListenerImpl.java (with props)
lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java (with props)
lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/structs/
lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/structs/RSSChannel.java (with props)
lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/structs/RSSItem.java (with props)
lucene/nutch/trunk/src/plugin/parse-rss/src/test/
lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/
lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/
lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/
lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/
lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/
lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java (with props)
Modified:
lucene/nutch/trunk/src/plugin/build.xml
Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=232303&r1=232302&r2=232303&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Fri Aug 12 07:23:47 2005
@@ -14,6 +14,7 @@
<ant dir="parse-js" target="deploy"/>
<ant dir="parse-text" target="deploy"/>
<ant dir="parse-pdf" target="deploy"/>
+ <ant dir="parse-rss" target="deploy"/>
<ant dir="parse-msword" target="deploy"/>
<!-- <ant dir="parse-mp3" target="deploy"/> -->
<!-- <ant dir="parse-rtf" target="deploy"/> -->
@@ -39,6 +40,7 @@
<ant dir="protocol-http" target="test"/>
<ant dir="parse-html" target="test"/>
<ant dir="parse-pdf" target="test"/>
+ <ant dir="parse-rss" target="test"/>
<ant dir="parse-msword" target="test"/>
<!-- <ant dir="parse-mp3" target="test"/> -->
<!-- <ant dir="parse-rtf" target="test"/> -->
@@ -60,6 +62,7 @@
<ant dir="parse-js" target="clean"/>
<ant dir="parse-text" target="clean"/>
<ant dir="parse-pdf" target="clean"/>
+ <ant dir="parse-rss" target="clean"/>
<ant dir="parse-msword" target="clean"/>
<ant dir="parse-mp3" target="clean"/>
<ant dir="parse-rtf" target="clean"/>
Added: lucene/nutch/trunk/src/plugin/parse-rss/build.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/build.xml?rev=232303&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rss/build.xml (added)
+++ lucene/nutch/trunk/src/plugin/parse-rss/build.xml Fri Aug 12 07:23:47 2005
@@ -0,0 +1,13 @@
+<?xml version="1.0"?>
+
+<project name="parse-rss" default="jar">
+
+ <import file="../build-plugin.xml"/>
+
+
+ <!-- for junit test -->
+ <mkdir dir="${build.test}/data"/>
+ <copy file="sample/rsstest.rss" todir="${build.test}/data"/>
+
+
+</project>
Propchange: lucene/nutch/trunk/src/plugin/parse-rss/build.xml
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/parse-rss/lib/commons-feedparser-0.6-fork.jar
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/lib/commons-feedparser-0.6-fork.jar?rev=232303&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/trunk/src/plugin/parse-rss/lib/commons-feedparser-0.6-fork.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/nutch/trunk/src/plugin/parse-rss/lib/commons-httpclient-3.0-beta1.jar
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/lib/commons-httpclient-3.0-beta1.jar?rev=232303&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/trunk/src/plugin/parse-rss/lib/commons-httpclient-3.0-beta1.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/nutch/trunk/src/plugin/parse-rss/lib/jaxen-core.jar
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/lib/jaxen-core.jar?rev=232303&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/trunk/src/plugin/parse-rss/lib/jaxen-core.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/nutch/trunk/src/plugin/parse-rss/lib/jaxen-jdom.jar
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/lib/jaxen-jdom.jar?rev=232303&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/trunk/src/plugin/parse-rss/lib/jaxen-jdom.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/nutch/trunk/src/plugin/parse-rss/lib/jdom.jar
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/lib/jdom.jar?rev=232303&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/trunk/src/plugin/parse-rss/lib/jdom.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/nutch/trunk/src/plugin/parse-rss/lib/log4j-1.2.6.jar
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/lib/log4j-1.2.6.jar?rev=232303&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/trunk/src/plugin/parse-rss/lib/log4j-1.2.6.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/nutch/trunk/src/plugin/parse-rss/lib/saxpath.jar
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/lib/saxpath.jar?rev=232303&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/trunk/src/plugin/parse-rss/lib/saxpath.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/nutch/trunk/src/plugin/parse-rss/lib/xercesImpl.jar
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/lib/xercesImpl.jar?rev=232303&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/trunk/src/plugin/parse-rss/lib/xercesImpl.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/nutch/trunk/src/plugin/parse-rss/lib/xml-apis.jar
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/lib/xml-apis.jar?rev=232303&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/trunk/src/plugin/parse-rss/lib/xml-apis.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/nutch/trunk/src/plugin/parse-rss/lib/xmlrpc-1.2.jar
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/lib/xmlrpc-1.2.jar?rev=232303&view=auto
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/trunk/src/plugin/parse-rss/lib/xmlrpc-1.2.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml?rev=232303&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml (added)
+++ lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml Fri Aug 12 07:23:47 2005
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+ id="parse-rss"
+ name="RSS Parse Plug-in"
+ version="1.0.0"
+ provider-name="edu.usc.cs.cs599">
+
+ <extension-point
+ id="org.apache.nutch.parse.Parser"
+ name="Nutch Content Parser"/>
+
+
+ <runtime>
+    <!-- library names must match the jar file names shipped in lib/ -->
+    <library name="parse-rss.jar">
+       <export name="*"/>
+    </library>
+    <library name="jdom.jar"/>
+    <library name="jaxen-core.jar"/>
+    <library name="jaxen-jdom.jar"/>
+    <library name="commons-feedparser-0.6-fork.jar"/>
+    <library name="saxpath.jar"/>
+    <library name="commons-httpclient-3.0-beta1.jar"/>
+    <library name="log4j-1.2.6.jar"/>
+    <library name="xercesImpl.jar"/>
+    <library name="xml-apis.jar"/>
+    <library name="xmlrpc-1.2.jar"/>
+ </runtime>
+
+ <extension id="org.apache.nutch.parse.rss"
+ name="RssParse"
+ point="org.apache.nutch.parse.Parser">
+
+ <implementation id="org.apache.nutch.parse.rss.RSSParser"
+ class="org.apache.nutch.parse.rss.RSSParser"
+ contentType="application/rss+xml"
+ pathSuffix="rss"/>
+
+ </extension>
+
+</plugin>
Propchange: lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/parse-rss/sample/rsstest.rss
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/sample/rsstest.rss?rev=232303&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rss/sample/rsstest.rss (added)
+++ lucene/nutch/trunk/src/plugin/parse-rss/sample/rsstest.rss Fri Aug 12 07:23:47 2005
@@ -0,0 +1,21 @@
+<?xml version="1.0" encoding="ISO-8859-1" ?>
+<rss version="0.91">
+ <channel>
+ <title>TestChannel</title>
+ <link>http://test.channel.com/</link>
+ <description>Sample RSS File for Junit test</description>
+ <language>en-us</language>
+
+ <item>
+ <title>Home Page of Chris Mattmann</title>
+ <link>http://www-scf.usc.edu/~mattmann/</link>
+ <description>Chris Mattmann's home page</description>
+ </item>
+
+ <item>
+ <title>Awesome Open Source Search Engine</title>
+ <link>http://www.nutch.org/</link>
+ <description>Yup, that's what it is</description>
+ </item>
+ </channel>
+</rss>
Propchange: lucene/nutch/trunk/src/plugin/parse-rss/sample/rsstest.rss
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/FeedParserListenerImpl.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/FeedParserListenerImpl.java?rev=232303&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/FeedParserListenerImpl.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/FeedParserListenerImpl.java Fri Aug 12 07:23:47 2005
@@ -0,0 +1,128 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.rss;
+
+import org.apache.commons.feedparser.DefaultFeedParserListener;
+import org.apache.commons.feedparser.FeedParserState;
+import org.apache.commons.feedparser.FeedParserException;
+
+import java.util.List;
+import java.util.Vector;
+import java.util.Date;
+
+import org.apache.nutch.parse.rss.structs.RSSChannel;
+import org.apache.nutch.parse.rss.structs.RSSItem;
+
+/**
+ *
+ * @author mattmann
+ * @version 1.0
+ *
+ * <p>
+ * Feed parser listener class which builds up an RSS Channel model that can be
+ * iterated through to retrieve the parsed information.
+ * </p>
+ */
+public class FeedParserListenerImpl extends DefaultFeedParserListener {
+
+  // channels already recorded, in the order they were encountered
+  private List fRssChannels = null;
+
+  // channel that onItem() is currently adding to; null before any onChannel()
+  private RSSChannel fCurrentChannel = null;
+
+  /**
+   * <p>
+   * Creates a listener with an empty channel list.
+   * </p>
+   */
+  public FeedParserListenerImpl() {
+    fRssChannels = new Vector();
+  }
+
+  /**
+   * <p>
+   * Gets the {@link List} of {@link RSSChannel}s parsed so far, including the
+   * channel that is still being filled. The list is empty, and never
+   * contains <code>null</code>, when no channel has been encountered.
+   * </p>
+   *
+   * @return A {@link List} of {@link RSSChannel}s.
+   */
+  public List getChannels() {
+    recordCurrentChannel();
+    return fRssChannels;
+  }
+
+  /**
+   * Appends the channel being built to the result list, unless there is none
+   * or it has been recorded already (so repeated calls add no duplicates).
+   */
+  private void recordCurrentChannel() {
+    if (fCurrentChannel != null && !fRssChannels.contains(fCurrentChannel)) {
+      fRssChannels.add(fCurrentChannel);
+    }
+  }
+
+  /**
+   * <p>
+   * Callback method when the parser encounters an RSS Channel.
+   * </p>
+   *
+   * @param state
+   *          The current state of the FeedParser.
+   * @param title
+   *          The title of the RSS Channel.
+   * @param link
+   *          A hyperlink to the RSS Channel.
+   * @param description
+   *          The description of the RSS Channel.
+   */
+  public void onChannel(FeedParserState state, String title, String link,
+      String description) throws FeedParserException {
+    // keep the previous channel before starting a new one
+    recordCurrentChannel();
+
+    // RSSChannel's constructor takes (description, link, title), in that order
+    fCurrentChannel = new RSSChannel(description, link, title);
+  }
+
+  /**
+   * <p>
+   * Callback method when the parser encounters an RSS Item.
+   * </p>
+   *
+   * @param state
+   *          The current state of the FeedParser.
+   * @param title
+   *          The title of the RSS Item.
+   * @param link
+   *          A hyperlink to the RSS Item.
+   * @param description
+   *          The description of the RSS Item.
+   * @param permalink
+   *          A permanent link to the RSS Item.
+   */
+  public void onItem(FeedParserState state, String title, String link,
+      String description, String permalink) throws FeedParserException {
+    // an item reported before any channel has nowhere to go and is dropped
+    if (fCurrentChannel != null) {
+      fCurrentChannel.getItems().add(
+          new RSSItem(title, link, description, permalink));
+    }
+  }
+
+}
Propchange: lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/FeedParserListenerImpl.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java?rev=232303&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java Fri Aug 12 07:23:47 2005
@@ -0,0 +1,228 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.rss;
+
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseException;
+
+import org.apache.nutch.parse.rss.structs.RSSItem;
+import org.apache.nutch.parse.rss.structs.RSSChannel;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.io.IOException;
+
+import java.net.MalformedURLException;
+
+import java.util.Properties;
+import java.util.logging.Logger;
+import java.util.Date;
+import java.util.List;
+import java.util.Vector;
+
+// add all the RSS parsing imports right here
+import org.apache.commons.feedparser.FeedParserState;
+import org.apache.commons.feedparser.FeedParserListener;
+import org.apache.commons.feedparser.FeedParser;
+import org.apache.commons.feedparser.FeedParserException;
+import org.apache.commons.feedparser.FeedParserFactory;
+import org.apache.commons.feedparser.DefaultFeedParserListener;
+import org.apache.commons.feedparser.network.ResourceRequestFactory;
+import org.apache.commons.feedparser.network.ResourceRequest;
+
+/**
+ *
+ * @author mattmann
+ * @version 1.0
+ *
+ * <p>
+ * RSS Parser class for nutch
+ * </p>
+ */
+public class RSSParser implements Parser {
+  public static final Logger LOG = LogFormatter
+      .getLogger("org.apache.nutch.parse.rss");
+
+  /**
+   * <p>
+   * Default Constructor. Attaches an appender to the log4j root logger that
+   * writes to the stream {@link LogFormatter} provides for {@link #LOG}.
+   * </p>
+   */
+  public RSSParser() {
+    // redirect org.apache.log4j.Logger to java's native logger, in order
+    // to, at least, suppress annoying log4j warnings.
+    // Note on 20040614 by Xing:
+    // log4j is used by pdfbox. This snippet'd better be moved
+    // to a common place shared by all parsers that use log4j.
+    // NOTE(review): each new RSSParser adds one more appender to the root
+    // logger -- confirm that the plugin framework reuses parser instances.
+    org.apache.log4j.Logger rootLogger = org.apache.log4j.Logger
+        .getRootLogger();
+
+    rootLogger.setLevel(org.apache.log4j.Level.INFO);
+
+    org.apache.log4j.Appender appender = new org.apache.log4j.WriterAppender(
+        new org.apache.log4j.SimpleLayout(),
+        org.apache.nutch.util.LogFormatter.getLogStream(LOG,
+            java.util.logging.Level.INFO));
+
+    rootLogger.addAppender(appender);
+  }
+
+  /**
+   * <p>
+   * Implementation method, parses the RSS content, and then returns a
+   * {@link ParseImpl}.
+   * </p>
+   * <p>
+   * The parse title is the concatenation of the channel titles, the parse
+   * text is the concatenation of the channel and item descriptions, and the
+   * outlinks are the channel links plus, for each item, its permalink (or
+   * its plain link when it has no permalink).
+   * </p>
+   *
+   * @param content
+   *          The content to parse (hopefully an RSS content stream)
+   * @return A {@link ParseImpl} which implements the {@link Parse} interface;
+   *         the empty parse of a failure {@link ParseStatus} when the content
+   *         type is not accepted or the feed parser throws.
+   */
+  public Parse getParse(Content content) {
+    // check that contentType is one we can handle
+    String contentType = content.getContentType();
+    if (contentType != null
+        && (!contentType.startsWith("text/xml") && !contentType
+            .startsWith("application/rss+xml")))
+      return new ParseStatus(ParseStatus.FAILED_INVALID_FORMAT,
+          "Content-Type not text/xml or application/rss+xml: "
+              + contentType).getEmptyParse();
+
+    List theRSSChannels = null;
+
+    try {
+      byte[] raw = content.getContent();
+
+      // create a new FeedParser...
+      FeedParser parser = FeedParserFactory.newFeedParser();
+
+      // create a listener for handling our callbacks
+      FeedParserListener listener = new FeedParserListenerImpl();
+
+      // start parsing our feed and have the onItem methods called
+      parser.parse(listener, new ByteArrayInputStream(raw), /* resource */
+          null);
+
+      theRSSChannels = ((FeedParserListenerImpl) listener).getChannels();
+
+    } catch (Exception e) { // run time exception
+      // hand the throwable to the logger instead of printing it to stderr
+      LOG.log(java.util.logging.Level.FINE,
+          "nutch:parse-rss:RSSParser Exception: " + e.getMessage(), e);
+      return new ParseStatus(ParseStatus.FAILED,
+          "Can't be handled as rss document. " + e).getEmptyParse();
+    }
+
+    StringBuffer contentTitle = new StringBuffer(), indexText = new StringBuffer();
+    List theOutlinks = new Vector();
+
+    if (theRSSChannels != null) {
+      for (int i = 0; i < theRSSChannels.size(); i++) {
+        RSSChannel r = (RSSChannel) theRSSChannels.get(i);
+        if (r == null) {
+          // defensive: ignore null entries in the channel list
+          continue;
+        }
+        appendText(contentTitle, r.getTitle());
+        appendText(indexText, r.getDescription());
+
+        // a malformed channel link is logged and skipped, but the channel's
+        // items below are still processed
+        addOutlink(theOutlinks, r.getLink(), r.getDescription());
+
+        // now get the descriptions of all the underlying RSS Items and
+        // then index them too
+        for (int j = 0; j < r.getItems().size(); j++) {
+          RSSItem theRSSItem = (RSSItem) r.getItems().get(j);
+          appendText(indexText, theRSSItem.getDescription());
+
+          // prefer the permalink, fall back to the plain link
+          String whichLink = theRSSItem.getPermalink();
+          if (whichLink == null) {
+            whichLink = theRSSItem.getLink();
+          }
+
+          addOutlink(theOutlinks, whichLink, theRSSItem.getDescription());
+        }
+      }
+
+      LOG.fine("nutch:parse-rss:getParse:indexText=" + indexText);
+      LOG.fine("nutch:parse-rss:getParse:contentTitle=" + contentTitle);
+
+    } else {
+      LOG
+          .fine("nutch:parse-rss:Error:getParse: No RSS Channels recorded!");
+    }
+
+    // format the outlinks
+    Outlink[] outlinks = (Outlink[]) theOutlinks
+        .toArray(new Outlink[theOutlinks.size()]);
+
+    LOG.fine("nutch:parse-rss:getParse:found " + outlinks.length
+        + " outlinks");
+
+    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
+        contentTitle.toString(), outlinks, content.getMetadata());
+    return new ParseImpl(indexText.toString(), parseData);
+  }
+
+  /**
+   * Appends <code>text</code> and a separating blank to <code>buffer</code>;
+   * a <code>null</code> text is skipped so that the literal "null" is never
+   * indexed.
+   */
+  private static void appendText(StringBuffer buffer, String text) {
+    if (text != null) {
+      buffer.append(text).append(" ");
+    }
+  }
+
+  /**
+   * Adds an {@link Outlink} for <code>url</code> to <code>outlinks</code>. A
+   * <code>null</code> url is ignored; a malformed one is logged and skipped.
+   */
+  private static void addOutlink(List outlinks, String url, String anchor) {
+    if (url == null) {
+      return;
+    }
+    try {
+      outlinks.add(new Outlink(url, anchor));
+    } catch (MalformedURLException e) {
+      LOG.log(java.util.logging.Level.INFO,
+          "nutch:parse-rss:RSSParser Exception: MalformedURL: " + url
+              + ": Attempting to continue processing outlinks", e);
+    }
+  }
+
+}
Propchange: lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/structs/RSSChannel.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/structs/RSSChannel.java?rev=232303&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/structs/RSSChannel.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/structs/RSSChannel.java Fri Aug 12 07:23:47 2005
@@ -0,0 +1,188 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.rss.structs;
+
+import java.util.List;
+import java.util.Vector;
+
+/**
+ *
+ * <p>
+ * Data class for holding RSS Channels to send to Nutch's indexer
+ * </p>
+ *
+ * @author mattmann
+ * @version 1.0
+ */
+public class RSSChannel {
+
+  // human-readable summary of the channel
+  private String fDescription = null;
+
+  // hyperlink to the channel's page
+  private String fLink = null;
+
+  // name of the channel
+  private String fTitle = null;
+
+  // the RSSItems that belong to the channel
+  private List fItems = null;
+
+  /**
+   *
+   * <p>
+   * Creates a fully populated channel. Mind the argument order: the
+   * description comes first and the title last.
+   * </p>
+   *
+   * @param desc
+   *          The description of the channel.
+   * @param link
+   *          A link to the channel's url.
+   * @param title
+   *          The title of the channel.
+   * @param items
+   *          A list of {@link RSSItem}s for this RSS Channel.
+   */
+  public RSSChannel(String desc, String link, String title, List items) {
+    this.fDescription = desc;
+    this.fLink = link;
+    this.fTitle = title;
+    this.fItems = items;
+  }
+
+  /**
+   *
+   * <p>
+   * Creates a channel whose items are not known yet. The channel starts
+   * out with an empty list that callers can fill through
+   * {@link #getItems()}.
+   * </p>
+   *
+   * @param desc
+   *          The description of the channel.
+   * @param link
+   *          A link to the channel's url.
+   * @param title
+   *          The title of the channel.
+   */
+  public RSSChannel(String desc, String link, String title) {
+    // identical to the four-argument constructor, called with a fresh,
+    // empty Vector
+    this(desc, link, title, new Vector());
+  }
+
+  /**
+   *
+   * <p>
+   * Returns the list of items held by this channel (the list itself, not a copy).
+   * </p>
+   *
+   * @return A list of {@link RSSItem}s.
+   */
+  public List getItems() {
+    return this.fItems;
+  }
+
+  /**
+   *
+   * <p>
+   * Reads the title of this channel, as passed to the constructor or to
+   * {@link #setTitle(String)}.
+   * </p>
+   *
+   * @return The title of the channel.
+   */
+  public String getTitle() {
+    return this.fTitle;
+  }
+
+  /**
+   *
+   * <p>
+   * Reads the link of this RSS Channel.
+   * </p>
+   *
+   * @return A {@link String} link to the RSS Channel.
+   */
+  public String getLink() {
+    return this.fLink;
+  }
+
+  /**
+   *
+   * <p>
+   * Reads the {@link String} description of this RSS Channel.
+   * </p>
+   *
+   * @return The description of the RSS Channel.
+   */
+  public String getDescription() {
+    return this.fDescription;
+  }
+
+  /**
+   *
+   * <p>
+   * Replaces the list of RSS items of this channel.
+   * </p>
+   *
+   * @param items
+   *          A List of {@link RSSItem}s for this RSSChannel.
+   */
+  public void setItems(List items) {
+    this.fItems = items;
+  }
+
+  /**
+   *
+   * <p>
+   * Replaces the title of this RSS Channel.
+   * </p>
+   *
+   * @param title
+   *          The title of this RSSChannel.
+   */
+  public void setTitle(String title) {
+    this.fTitle = title;
+  }
+
+  /**
+   *
+   * <p>
+   * Replaces the link of this RSSChannel.
+   * </p>
+   *
+   * @param link
+   *          A {@link String} representation of a link to this RSS Channel.
+   */
+  public void setLink(String link) {
+    this.fLink = link;
+  }
+
+  /**
+   *
+   * <p>
+   * Replaces the description of this RSSChannel.
+   * </p>
+   *
+   * @param description
+   *          A String description of this RSS Channel.
+   */
+  public void setDescription(String description) {
+    this.fDescription = description;
+  }
+}
Propchange: lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/structs/RSSChannel.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/structs/RSSItem.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/structs/RSSItem.java?rev=232303&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/structs/RSSItem.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/structs/RSSItem.java Fri Aug 12 07:23:47 2005
@@ -0,0 +1,150 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.rss.structs;
+
+/**
+ *
+ * <p>
+ * Data class for holding RSS Items to send to Nutch's indexer
+ * </p>
+ *
+ * @author mattmann
+ * @version 1.0
+ */
+public class RSSItem {
+
+  //The title of this RSS Item
+  private String fTitle = null;
+
+  //The link that this RSS Item points to
+  private String fLink = null;
+
+  //The description of this RSS Item
+  private String fDescription = null;
+
+  //A permanent link that this RSS Item points to
+  private String fPermalink = null;
+
+  /**
+   * Creates an RSS Item.
+   *
+   * @param title The title of this RSS Item
+   * @param link The link that this RSS Item points to
+   * @param description The description of this RSS Item
+   * @param permalink The permanent link of this RSS Item, if any
+   */
+  public RSSItem(String title, String link, String description,
+      String permalink) {
+    fTitle = title;
+    fLink = link;
+    fDescription = description;
+    fPermalink = permalink;
+  }
+
+  /**
+   * <p>
+   * Get the title for this RSS Item
+   * </p>
+   *
+   * @return The title of this RSS Item
+   */
+  public String getTitle() {
+    return fTitle;
+  }
+
+  /**
+   * <p>
+   * Gets the link that this RSS Item points to.
+   * </p>
+   *
+   * @return The link that this RSS Item points to.
+   */
+  public String getLink() {
+    return fLink;
+  }
+
+  /**
+   * <p>
+   * Gets the Description of this RSS Item
+   * </p>
+   *
+   * @return The description of this RSS Item.
+   */
+  public String getDescription() {
+    return fDescription;
+  }
+
+  /**
+   * <p>
+   * If this RSS Item points to a permanent link, then this method returns it.
+   * </p>
+   *
+   * @return The permanent link that this RSS Item points to.
+   */
+  public String getPermalink() {
+    return fPermalink;
+  }
+
+  /**
+   * <p>
+   * Sets the title for this RSS Item.
+   * </p>
+   *
+   * @param title
+   *          The title of this RSS Item
+   */
+  public void setTitle(String title) {
+    fTitle = title;
+  }
+
+  /**
+   * <p>
+   * Sets the link that this RSS Item points to; the title is left untouched.
+   * </p>
+   *
+   * @param link
+   *          The link that this RSS Item points to.
+   */
+  public void setLink(String link) {
+    fLink = link;
+  }
+
+  /**
+   * <p>
+   * Sets the description of this RSS Item.
+   * </p>
+   *
+   * @param description
+   *          The description of this RSS Item.
+   */
+  public void setDescription(String description) {
+    fDescription = description;
+  }
+
+  /**
+   * <p>
+   * Sets the permanent link that this RSS Item points to.
+   * </p>
+   *
+   * @param permalink
+   *          The permanent link that this RSS Item points to
+   */
+  public void setPermalink(String permalink) {
+    fPermalink = permalink;
+  }
+
+}
Propchange: lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/structs/RSSItem.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java
URL: http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java?rev=232303&view=auto
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java (added)
+++ lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java Fri Aug 12 07:23:47 2005
@@ -0,0 +1,128 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.rss;
+
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+
+import org.apache.nutch.parse.ParserFactory;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.Outlink;
+
+import junit.framework.TestCase;
+
+/**
+ * Unit tests for the RSS Parser based on John Xing's TestPdfParser class.
+ *
+ * @author mattmann
+ * @version 1.0
+ */
+public class TestRSSParser extends TestCase {
+
+  private String fileSeparator = System.getProperty("file.separator");
+
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/parse-rss/build.xml during plugin compilation.
+
+  private String[] sampleFiles = { "rsstest.rss" };
+
+  // Outlink URLs that the test requires among the parsed outlinks of every
+  // sample file.
+  private String[] expectedOutlinks = {
+      "http://www-scf.usc.edu/~mattmann/",
+      "http://www.nutch.org/",
+      "http://test.channel.com/" };
+
+  /**
+   * <p>
+   * Default constructor
+   * </p>
+   *
+   * @param name
+   *          The name of the RSSParserTest
+   */
+  public TestRSSParser(String name) {
+    super(name);
+  }
+
+  /**
+   * <p>
+   * The test method: for every sample file it fetches the content through
+   * the protocol layer, parses it, and checks the following 2 asserts:
+   * </p>
+   *
+   * <ul>
+   * <li>There are 3 outlinks read from the sample rss file</li>
+   * <li>Each of the 3 expected outlink URLs is present among the outlinks
+   * read</li>
+   * </ul>
+   *
+   * @throws ProtocolException
+   *           if the sample file cannot be fetched
+   * @throws ParseException
+   *           if the fetched content cannot be parsed
+   */
+  public void testIt() throws ProtocolException, ParseException {
+    for (int i = 0; i < sampleFiles.length; i++) {
+      String urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+      Protocol protocol = ProtocolFactory.getProtocol(urlString);
+      Content content = protocol.getProtocolOutput(urlString).getContent();
+
+      Parser parser = ParserFactory.getParser(content.getContentType(),
+          urlString);
+      Parse parse = parser.getParse(content);
+
+      ParseData theParseData = parse.getData();
+      Outlink[] theOutlinks = theParseData.getOutlinks();
+
+      // assertEquals (rather than assertTrue on a comparison) so that a
+      // failure reports how many outlinks were actually read.
+      assertEquals("There aren't 3 outlinks read!", 3, theOutlinks.length);
+
+      // Now check that each of the three expected outlinks was read; one
+      // assertion per link so a failure names the missing URL.
+      for (int j = 0; j < expectedOutlinks.length; j++) {
+        assertTrue("Outlinks read from sample rss file are not correct!"
+            + " Missing outlink: " + expectedOutlinks[j],
+            hasOutlink(theOutlinks, expectedOutlinks[j]));
+      }
+    }
+  }
+
+  /**
+   * Tells whether any of the given outlinks targets the given URL.
+   *
+   * @param outlinks
+   *          The outlinks to search.
+   * @param url
+   *          The exact target URL to look for.
+   * @return true if some outlink's target URL equals <code>url</code>,
+   *         false otherwise.
+   */
+  private static boolean hasOutlink(Outlink[] outlinks, String url) {
+    for (int k = 0; k < outlinks.length; k++) {
+      // Compare from the known non-null side so a null target URL counts
+      // as a mismatch instead of raising a NullPointerException.
+      if (url.equals(outlinks[k].getToUrl())) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+}
Propchange: lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java
------------------------------------------------------------------------------
svn:eol-style = native