You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@commons.apache.org by bu...@apache.org on 2004/09/01 01:56:14 UTC
cvs commit: jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate FeedLocator.java LinkLocator.java ProbeLocator.java
burton 2004/08/31 16:56:14
Modified: feedparser build.xml
feedparser/src/java/org/apache/commons/feedparser/locate
FeedLocator.java LinkLocator.java ProbeLocator.java
Log:
patches from Brad Neuberg to perform more advanced probe location... added some cleanup around new feed refs
Revision Changes Path
1.5 +37 -1 jakarta-commons-sandbox/feedparser/build.xml
Index: build.xml
===================================================================
RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/build.xml,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -r1.4 -r1.5
--- build.xml 4 Jul 2004 06:31:10 -0000 1.4
+++ build.xml 31 Aug 2004 23:56:13 -0000 1.5
@@ -29,10 +29,16 @@
</fileset>
<fileset dir="/projects/ksa/lib">
+ <include name="*.jar"/>
+ </fileset>
+
+ <!--
+ <fileset dir="c:/usr/local/ksa/lib">
<include name="*.jar"/>
</fileset>
+ -->
</path>
@@ -41,12 +47,39 @@
<mkdir dir="compile"/>
<javac srcdir="src/java/"
- compiler="jikes"
classpathref="project.classpath"
destdir="compile"
debug="true"/>
</target>
+
+ <target name="run-example" description="Runs the example feed parser class"
+ depends="jakarta-feedparser.jar">
+ <java classpathref="project.classpath"
+ classname="org.apache.commons.feedparser.example.HelloFeedParser"
+ fork="true"
+ failonerror="true">
+ </java>
+ </target>
+
+ <target name="debug-feed" description="Debugs a feed"
+ depends="jakarta-feedparser.jar">
+ <java classpathref="project.classpath"
+ classname="org.apache.commons.feedparser.Main"
+ fork="true"
+ failonerror="true">
+ <arg value="${feed-url}"/>
+ </java>
+ </target>
+
+ <target name="test-autodiscover" description="Runs the testing class for autodiscovery"
+ depends="jakarta-feedparser.jar">
+ <java classpathref="project.classpath"
+ classname="org.apache.commons.feedparser.locate.FeedLocator"
+ fork="true"
+ failonerror="true">
+ </java>
+ </target>
<target name="clean">
<delete dir="compile"/>
@@ -83,6 +116,7 @@
<formatter type="plain" usefile="false"/>
+ <test name="org.apache.commons.feedparser.test.TestProbeLocator"/>
<test name="org.apache.commons.feedparser.test.TestAtom"/>
<test name="org.apache.commons.feedparser.test.TestFeedParserUTF8"/>
@@ -93,7 +127,9 @@
</target>
<target name="javadoc">
+
<mkdir dir="${build.dir}/docs/api"/>
+
<javadoc sourcepath="src/java"
packagenames="org.apache.commons.feedparser.*"
destdir="docs/api"
1.18 +17 -5 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/FeedLocator.java
Index: FeedLocator.java
===================================================================
RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/FeedLocator.java,v
retrieving revision 1.17
retrieving revision 1.18
diff -u -r1.17 -r1.18
--- FeedLocator.java 20 Aug 2004 21:44:06 -0000 1.17
+++ FeedLocator.java 31 Aug 2004 23:56:14 -0000 1.18
@@ -20,18 +20,30 @@
import org.peerfear.newsmonster.network.*;
-import java.io.*;
import java.util.*;
/**
- * Method to determine feed URLs from a given.
+ * Method to determine feed URLs from a given resource URI. For example,
+ * you would pass in the URI:
+ *
+ * http://www.codinginparadise.org
+ *
+ * and this class would pass back a List with one address of the feed URL,
+ * which is
+ *
+ * http://www.codinginparadise.org/weblog/atom.xml
+ *
+ * <code>
+ * String resource = "http://www.codinginparadise.org";
+ * FeedList l = FeedLocator.locate( resource );
+ * </code>
*
* @author <a href="mailto:burton@apache.org">Kevin A. Burton</a>
*/
public class FeedLocator {
-
+
/**
- * Locate all feeds within the give resource. The resource should be a link
+ * Locate all feeds within the given resource. The resource should be a link
* to an (X)HTML document, usually a weblog or a website.
*
* Example: http://peerfear.org
@@ -66,7 +78,7 @@
//this failed... try looking for links
LinkLocator.locate( resource, content, list );
-
+
//this failed... try probe location. This is more reliable than
//LinkLocation but requires a few more HTTP gets.
if ( list.size() == 0 )
1.8 +1 -1 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/LinkLocator.java
Index: LinkLocator.java
===================================================================
RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/LinkLocator.java,v
retrieving revision 1.7
retrieving revision 1.8
diff -u -r1.7 -r1.8
--- LinkLocator.java 31 Aug 2004 22:35:52 -0000 1.7
+++ LinkLocator.java 31 Aug 2004 23:56:14 -0000 1.8
@@ -140,7 +140,7 @@
if ( current.endsWith( ".atom" ) ) {
FeedReference ref = new FeedReference( current,
- FeedReference.ATOM_MEDIA_TYPE );
+ FeedReference.RSS_MEDIA_TYPE );
//Make sure to preserve existing AD feeds first.
if ( ! hasExplicitAtomFeed )
1.9 +272 -46 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/ProbeLocator.java
Index: ProbeLocator.java
===================================================================
RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/ProbeLocator.java,v
retrieving revision 1.8
retrieving revision 1.9
diff -u -r1.8 -r1.9
--- ProbeLocator.java 20 Aug 2004 21:44:06 -0000 1.8
+++ ProbeLocator.java 31 Aug 2004 23:56:14 -0000 1.9
@@ -18,55 +18,145 @@
import org.apache.commons.feedparser.*;
-import java.io.*;
+import org.peerfear.newsmonster.network.*;
+
import java.util.*;
import java.util.regex.*;
+import java.net.*;
/**
- * Locator which uses Link probing
+ * Locator which uses Link probing. It also attempts to determine the type of
+ * blog service provider it is dealing with, such as BlogSpot, Blosxom, etc.,
+ * in order to find feed URLs that are not specified through autodiscovery.
+ *
+ * If ProbeLocator.AGGRESIVE_PROBING_ENABLED is true (by default it is false),
+ * then we probe for links.
+ *
+ *
*
* @author <a href="mailto:burton@apache.org">Kevin A. Burton</a>
*/
public class ProbeLocator {
- static HashMap probeMapping = new HashMap();
-
- //FIXME: also just try common path names. Do this in ORDER so that I find
- //beter metadata feeds sooner
- //
- // /atom.xml
- // /atom.xml
- // /rss.xml
- // /index.xml
- // /index.rdf
+ /** If true, then we aggressively probe a site if it doesn't have
+ * autodiscovery. This includes trying to determine what the blog provider
+ * is, trying individual locations based on a blog provider, and probing
+ * in several locations if the blog provider is unknown.
+ *
+ * The default value for this should be false. This should only be
+ * used on server-side aggregators that generate few requests, and
+ * _never_ on client-side aggregators. The level of traffic for
+ * client-side aggregators would be too great.
+ */
+ public static boolean AGGRESIVE_PROBING_ENABLED = false;
+
+ /** If true, then after discovering what a site's blog provider is we
+ * probe in select locations for feeds based on the provider. This
+ * is useful if autodiscovery is not enabled on this blog and we don't
+ * want to do the full aggressive probing.
+ *
+ * The default value for this should be false. This should only
+ * be used on server-side aggregators that generate few requests,
+ * and _never_ on client-side aggregators. The level of traffic
+ * for client-side aggregators would be too great.
+ */
+ public static boolean BLOG_SERVICE_PROBING_ENABLED = false;
+
+ /**
+ * A regex to find any trailing filename and strip it
+ */
+ private static Pattern patternToStrip = Pattern.compile("/\\w*\\.\\w*$");
+
+ /**
+ * A regex to extract the user from a Xanga URL
+ */
+ private static Pattern xangaURLPattern = Pattern.compile(".*user=(\\w*)");
+
+ /**
+ * Internal map to store probe URIs and their services.
+ */
+ private static Map probeMapping = new HashMap();
static {
-
- // We can use:
- //
- // <meta name="generator" content="Blogger" />
- //
- // To determine if we're on blogger.
-
- //now resort to link probing
- //
- // /rss.xml (for radio blogs)
- // /index.rdf (for moveable type blogs
-
- //FIXME: Live Journal has both FOAF and Atom... We need support for BOTH
- probeMapping.put( "blogspot.com", "/atom.xml" );
- probeMapping.put( "livejournal.com", "/data/atom" );
-
- // FIXME:
+ /** Associates a given BlogService with a list of usual locations to find
+ * their RSS file. The locations are given as an array of FeedReferences,
+ * with highest quality feeds put first. These blog providers don't
+ * provide consistent autodiscovery.
+ */
+ FeedReference blosxomLocations[] =
+ { new FeedReference("index.rss20", FeedReference.RSS_MEDIA_TYPE),
+ new FeedReference("index.rss", FeedReference.RSS_MEDIA_TYPE) };
+
+ // Diaryland doesn't offer feeds
+ //FeedReference diaryLandLocations[] = { "" };
+ FeedReference bloggerLocations[] =
+ { new FeedReference("atom.xml", FeedReference.ATOM_MEDIA_TYPE) };
- // http://www.xanga.com/rss.aspx?user=username
+ FeedReference aolJournalLocations[] =
+ { new FeedReference("rss.xml", FeedReference.RSS_MEDIA_TYPE) };
- // http://www.xanga.com/home.aspx?user=speedysonic
+ FeedReference pmachineLocations[] =
+ { new FeedReference("index.xml", FeedReference.RSS_MEDIA_TYPE) };
- // which turns out to be RSS 0.91
+ FeedReference textPatternLocations[] =
+ { new FeedReference("?atom=1", FeedReference.ATOM_MEDIA_TYPE),
+ new FeedReference("?rss=1", FeedReference.RSS_MEDIA_TYPE) };
+
+ FeedReference manilaLocations[] =
+ { new FeedReference("xml/rss.xml", FeedReference.RSS_MEDIA_TYPE),
+ new FeedReference("rss.xml", FeedReference.RSS_MEDIA_TYPE) };
+
+ FeedReference typepadLocations[] =
+ { new FeedReference("atom.xml", FeedReference.ATOM_MEDIA_TYPE),
+ new FeedReference("index.rdf", FeedReference.RSS_MEDIA_TYPE) };
+
+ FeedReference radioUserlandLocations[] =
+ { new FeedReference("rss.xml", FeedReference.RSS_MEDIA_TYPE) };
+
+ FeedReference liveJournalLocations[] =
+ { new FeedReference("data/atom", FeedReference.ATOM_MEDIA_TYPE),
+ new FeedReference("data/rss", FeedReference.RSS_MEDIA_TYPE) };
+
+ FeedReference wordPressLocations[] =
+ { new FeedReference("wp-atom.php", FeedReference.ATOM_MEDIA_TYPE),
+ new FeedReference("wp-rss2.php", FeedReference.RSS_MEDIA_TYPE),
+ new FeedReference("wp-rss.php", FeedReference.RSS_MEDIA_TYPE) };
- //probeMapping.put( "livejournal.com", "/data/atom" );
+ FeedReference iBlogLocations[] =
+ { new FeedReference("rss.xml", FeedReference.RSS_MEDIA_TYPE) };
+
+ // Xanga feeds have to be handled specially since they put their
+ // feeds at the location: http://www.xanga.com/rss.aspx?user=username
+ FeedReference xangaLocations[] =
+ { new FeedReference("rss.aspx?user=", FeedReference.RSS_MEDIA_TYPE) };
+
+ FeedReference unknownLocations[] =
+ { new FeedReference("atom.xml",FeedReference.ATOM_MEDIA_TYPE),
+ new FeedReference("index.rss", FeedReference.RSS_MEDIA_TYPE),
+ new FeedReference("rss.xml", FeedReference.RSS_MEDIA_TYPE),
+ new FeedReference("index.rdf", FeedReference.RSS_MEDIA_TYPE),
+ new FeedReference("index.xml", FeedReference.XML_MEDIA_TYPE) };
+
+ probeMapping.put( BlogService.BLOSXOM, blosxomLocations );
+
+ //Tue Aug 31 2004 04:21 PM (burton@rojo.com): Diaryland doesn't
+ //currently offer RSS or Atom feeds (shame, shame, shame). This is a
+ //placeholder until they see the light and provide Atom feeds.
+
+ //probeMapping.put( BlogService.DIARYLAND, diaryLandLocations );
+ probeMapping.put( BlogService.BLOGGER, bloggerLocations );
+ probeMapping.put( BlogService.AOL_JOURNAL, aolJournalLocations );
+ probeMapping.put( BlogService.PMACHINE, pmachineLocations );
+ probeMapping.put( BlogService.TEXTPATTERN, textPatternLocations );
+ probeMapping.put( BlogService.MANILA, manilaLocations );
+ probeMapping.put( BlogService.TYPEPAD, typepadLocations );
+ probeMapping.put( BlogService.RADIO_USERLAND, radioUserlandLocations );
+ probeMapping.put( BlogService.LIVEJOURNAL, liveJournalLocations );
+ probeMapping.put( BlogService.WORDPRESS, wordPressLocations );
+ probeMapping.put( BlogService.IBLOG, iBlogLocations );
+ probeMapping.put( BlogService.XANGA, xangaLocations);
+ probeMapping.put( BlogService.UNKNOWN, unknownLocations );
}
/**
@@ -76,31 +166,167 @@
public static final List locate( String resource, String content, FeedList list )
throws Exception {
- //FIXME:
+ if ( BLOG_SERVICE_PROBING_ENABLED || AGGRESIVE_PROBING_ENABLED ) {
+
+ // determine what blog service we are dealing with
- String domain = ResourceExpander.getDomain( resource );
+ BlogService blogService = BlogServiceDiscovery.discover( resource, content );
+
+ String baseFeedPath = getFeedPath( resource );
- if ( probeMapping.containsKey( domain ) ) {
+ FeedReference mapping[] = null;
- String mapping = (String)probeMapping.get( domain );
+ HashSet previousAttempts = new HashSet();
- String href = resource;
-
- if ( href.endsWith( "/" ) )
- href = href.substring( 0, href.length() - 1 );
+ boolean feedFound = false;
+
+ if ( probeMapping.containsKey( blogService ) ) {
+
+ mapping = (FeedReference[])probeMapping.get( blogService );
- href += mapping;
+ // try out each mapping
+ for (int i = 0; i < mapping.length; i++) {
+ String pathToTest = baseFeedPath + mapping[i].resource;
+
+ // we have to do special probing for Xanga
+ if ( blogService.equals( BlogService.XANGA ) ) {
+ pathToTest += getXangaUser(resource);;
+ }
+
+ if ( feedExists( pathToTest ) ) {
+ FeedReference feedReference = new FeedReference( pathToTest,
+ mapping[i].type );
+ feedReference.method = FeedReference.METHOD_PROBE_DISCOVERY;
+
+ onFeedReference( feedReference, list );
+
+ feedFound = true;
+
+ }
+
+ // record this attempt so we don't repeat it again if
+ // we are doing aggressive probing
+ previousAttempts.add( pathToTest );
+ }
+ }
- FeedReference feedReference = new FeedReference( href,
- FeedReference.ATOM_MEDIA_TYPE );
+ // if we have nothing so far, do aggressive probing
+ if ( AGGRESIVE_PROBING_ENABLED && feedFound == false ) {
+
+ mapping = (FeedReference[])probeMapping.get( BlogService.UNKNOWN );
- list.add( feedReference );
- list.setAdAtomFeed( feedReference );
+ // try out each mapping
+ for (int i = 0; i < mapping.length; i++) {
+ //NOTE: this shares duplicate code with the above tests.
+
+ String pathToTest = baseFeedPath + mapping[i].resource;
+ if ( previousAttempts.contains( pathToTest ) == false ) {
+ if ( feedExists( pathToTest ) ) {
+
+ FeedReference feedReference = new FeedReference( pathToTest,
+ mapping[i].type);
+
+ feedReference.method = FeedReference.METHOD_PROBE_DISCOVERY;
+
+ onFeedReference( feedReference, list );
+
+ }
+ }
+ }
+ }
}
return list;
+ }
+
+ /**
+ * Called each time we find a feed so that we can set the Ad method.
+ *
+ * @author <a href="mailto:burton@rojo.com">Kevin A. Burton</a>
+ */
+ private static void onFeedReference( FeedReference ref, FeedList list ) {
+
+ if ( list.getAdAtomFeed() == null &&
+ FeedReference.ATOM_MEDIA_TYPE.equals( ref.type ) ) {
+
+ list.setAdAtomFeed( ref );
+
+ } else if ( list.getAdRSSFeed() == null &&
+ FeedReference.RSS_MEDIA_TYPE.equals( ref.type ) ) {
+
+ list.setAdRSSFeed( ref );
+
+ }
+
+ list.add( ref );
+
+ }
+
+ /** This method takes a resource, such as "http://www.codinginparadise.org/myweblog.php",
+ * and gets the path necessary to build up a feed, such as
+ * "http://www.codinginparadise.org/". Basically it appends a slash to the end if there
+ * is not one, and removes any file names that might be at the end, such as
+ * "myweblog.php".
+ *
+ * @throws MalformedURLException Thrown if the given resource's URL is incorrectly
+ * formatted.
+ *
+ * @author Brad Neuberg, bkn3@columbia.edu
+ */
+ protected static String getFeedPath(String resource)
+ throws MalformedURLException {
+
+ // strip off any query string or anchors
+ int startStripAt = resource.indexOf("#");
+
+ if ( startStripAt == -1 ) {
+ startStripAt = resource.indexOf("?");
+ }
+ if (startStripAt != -1) {
+ resource = resource.substring(0, startStripAt);
+ }
+
+ Matcher fileMatcher = patternToStrip.matcher(resource);
+ resource = fileMatcher.replaceAll("");
+
+ if ( !resource.endsWith( "/" ) ) {
+ resource = resource + "/";
+ }
+
+ return resource;
+ }
+
+ /** Does an HTTP HEAD to see if the given resource exists.
+ *
+ * @param resource The full URI to the resource to check for.
+ *
+ * @author Brad Neuberg, bkn3@columbia.edu
+ */
+ protected static boolean feedExists(String resource) throws Exception {
+ ResourceRequest request = ResourceRequestFactory.getResourceRequest( resource );
+
+ request.setRequestMethod( "HEAD" );
+ request.setFollowRedirects( false );
+
+ // the call below actually causes the connection to be made
+ request.getContentLength();
+
+ long response = request.getResponseCode();
+
+ return response == 200;
+ }
+
+ /** Xanga's feed locations are dependent on the 'user' attribute in a
+ * Xanga URI. This method helps extract the user element from an
+ * existing URI, such as http://www.xanga.com/home.aspx?user=wdfphillz.
+ */
+ protected static String getXangaUser(String resource) {
+ Matcher xangaMatcher = xangaURLPattern.matcher(resource);
+ xangaMatcher.matches();
+
+ return xangaMatcher.group(1);
}
public static void main( String[] args ) throws Exception {
---------------------------------------------------------------------
To unsubscribe, e-mail: commons-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: commons-dev-help@jakarta.apache.org