You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@commons.apache.org by bu...@apache.org on 2004/09/01 01:56:14 UTC

cvs commit: jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate FeedLocator.java LinkLocator.java ProbeLocator.java

burton      2004/08/31 16:56:14

  Modified:    feedparser build.xml
               feedparser/src/java/org/apache/commons/feedparser/locate
                        FeedLocator.java LinkLocator.java ProbeLocator.java
  Log:
  patches from Brad Neuberg to perform more advanced probe location... added some cleanup around new feed refs
  
  Revision  Changes    Path
  1.5       +37 -1     jakarta-commons-sandbox/feedparser/build.xml
  
  Index: build.xml
  ===================================================================
  RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/build.xml,v
  retrieving revision 1.4
  retrieving revision 1.5
  diff -u -r1.4 -r1.5
  --- build.xml	4 Jul 2004 06:31:10 -0000	1.4
  +++ build.xml	31 Aug 2004 23:56:13 -0000	1.5
  @@ -29,10 +29,16 @@
           </fileset>
   
           <fileset dir="/projects/ksa/lib">
  +             <include name="*.jar"/>
  +        </fileset>
  +
  +        <!--
  +        <fileset dir="c:/usr/local/ksa/lib">
   
                <include name="*.jar"/>
   
           </fileset>
  +         -->
   
       </path>
   
  @@ -41,12 +47,39 @@
           <mkdir dir="compile"/>
   
           <javac srcdir="src/java/" 
  -               compiler="jikes"
                  classpathref="project.classpath"
                  destdir="compile" 
                  debug="true"/>
   
       </target>
  +    
  +    <target name="run-example"
  +            description="Runs the example feed parser class"
  +            depends="jakarta-feedparser.jar">
  +        <!-- HelloFeedParser is launched in a forked JVM; failonerror makes a
  +             failing run fail the build. -->
  +        <java classname="org.apache.commons.feedparser.example.HelloFeedParser"
  +              classpathref="project.classpath"
  +              fork="true"
  +              failonerror="true"/>
  +    </target>
  +    
  +    <!-- Usage: ant debug-feed -Dfeed-url=http://example.org/index.rss -->
  +    <target name="debug-feed" description="Debugs a feed"
  +            depends="jakarta-feedparser.jar">
  +       <!-- Ant leaves an undefined property unexpanded, so without this guard
  +            Main would be handed the literal property reference as its URL. -->
  +       <fail unless="feed-url"
  +             message="No feed given: run with -Dfeed-url=URL"/>
  +       <java classpathref="project.classpath"
  +             classname="org.apache.commons.feedparser.Main"
  +             fork="true"
  +             failonerror="true">
  +          <arg value="${feed-url}"/>
  +       </java>        
  +    </target>
  +    
  +    <target name="test-autodiscover"
  +            description="Runs the testing class for autodiscovery"
  +            depends="jakarta-feedparser.jar">
  +        <!-- FeedLocator is launched as the main class in a forked JVM;
  +             failonerror makes a failing run fail the build. -->
  +        <java classname="org.apache.commons.feedparser.locate.FeedLocator"
  +              classpathref="project.classpath"
  +              fork="true"
  +              failonerror="true"/>
  +    </target>
   
       <target name="clean">
           <delete dir="compile"/>
  @@ -83,6 +116,7 @@
                  
               <formatter type="plain" usefile="false"/>
   
  +            <test name="org.apache.commons.feedparser.test.TestProbeLocator"/>
               <test name="org.apache.commons.feedparser.test.TestAtom"/>
               <test name="org.apache.commons.feedparser.test.TestFeedParserUTF8"/>
   
  @@ -93,7 +127,9 @@
       </target>
   
       <target name="javadoc">
  +
           <mkdir dir="${build.dir}/docs/api"/>
  +
           <javadoc sourcepath="src/java"
                    packagenames="org.apache.commons.feedparser.*"
                    destdir="docs/api"
  
  
  
  1.18      +17 -5     jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/FeedLocator.java
  
  Index: FeedLocator.java
  ===================================================================
  RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/FeedLocator.java,v
  retrieving revision 1.17
  retrieving revision 1.18
  diff -u -r1.17 -r1.18
  --- FeedLocator.java	20 Aug 2004 21:44:06 -0000	1.17
  +++ FeedLocator.java	31 Aug 2004 23:56:14 -0000	1.18
  @@ -20,18 +20,30 @@
   
   import org.peerfear.newsmonster.network.*;
   
  -import java.io.*;
   import java.util.*;
   
   /**
  - * Method to determine feed URLs from a given. 
  + * Method to determine feed URLs from a given resource URI.  For example,
  + * you would pass in the URI:
  + * 
  + * http://www.codinginparadise.org
  + * 
  + * and this class would pass back a List with one address of the feed URL,
  + * which is
  + * 
  + * http://www.codinginparadise.org/weblog/atom.xml
  + *
  + * <code>
  + * String resource = "http://www.codinginparadise.org";
  + * FeedList l = FeedLocator.locate( resource );
  + * </code>
    * 
    * @author <a href="mailto:burton@apache.org">Kevin A. Burton</a>
    */
   public class FeedLocator {
  -
  +    
       /**
  -     * Locate all feeds within the give resource.  The resource should be a link
  +     * Locate all feeds within the given resource.  The resource should be a link
        * to an (X)HTML document, usually a weblog or a website.
        * 
        * Example: http://peerfear.org
  @@ -66,7 +78,7 @@
   
           //this failed... try looking for links
           LinkLocator.locate( resource, content, list );
  -
  +        
           //this failed... try probe location.  This is more reliable than
           //LinkLocation but requires a few more HTTP gets.
           if ( list.size() == 0 )
  
  
  
  1.8       +1 -1      jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/LinkLocator.java
  
  Index: LinkLocator.java
  ===================================================================
  RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/LinkLocator.java,v
  retrieving revision 1.7
  retrieving revision 1.8
  diff -u -r1.7 -r1.8
  --- LinkLocator.java	31 Aug 2004 22:35:52 -0000	1.7
  +++ LinkLocator.java	31 Aug 2004 23:56:14 -0000	1.8
  @@ -140,7 +140,7 @@
                       if ( current.endsWith( ".atom" ) ) {
   
                           FeedReference ref = new FeedReference( current,
  -                                                               FeedReference.ATOM_MEDIA_TYPE );
  +                                                               FeedReference.RSS_MEDIA_TYPE );
   
                           //Make sure to preserve existing AD feeds first.
                           if ( ! hasExplicitAtomFeed )
  
  
  
  1.9       +272 -46   jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/ProbeLocator.java
  
  Index: ProbeLocator.java
  ===================================================================
  RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/ProbeLocator.java,v
  retrieving revision 1.8
  retrieving revision 1.9
  diff -u -r1.8 -r1.9
  --- ProbeLocator.java	20 Aug 2004 21:44:06 -0000	1.8
  +++ ProbeLocator.java	31 Aug 2004 23:56:14 -0000	1.9
  @@ -18,55 +18,145 @@
   
   import org.apache.commons.feedparser.*;
   
  -import java.io.*;
  +import org.peerfear.newsmonster.network.*;
  +
   import java.util.*;
   import java.util.regex.*;
  +import java.net.*;
   
   /**
  - * Locator which uses Link probing
  + * Locator which uses Link probing.  It also attempts to determine the type of
  + * blog service provider it is dealing with, such as BlogSpot, Blosxom, etc.,
  + * in order to find feed URLs that are not specified through autodiscovery.
  + * 
  + * If ProbeLocator.AGGRESIVE_PROBING_ENABLED is true (by default it is false),
  + * then we probe for links.
  + * 
  + * 
    * 
    * @author <a href="mailto:burton@apache.org">Kevin A. Burton</a>
    */
   public class ProbeLocator {
   
  -    static HashMap probeMapping = new HashMap();
  -
  -    //FIXME: also just try common path names.  Do this in ORDER so that I find
  -    //beter metadata feeds sooner
  -    //
  -    //      /atom.xml
  -    //      /atom.xml
  -    //      /rss.xml
  -    //      /index.xml
  -    //      /index.rdf
  +    /** If true, then we aggressively probe a site if it doesn't have
  +     *  autodiscovery.  This includes trying to determine what the blog provider
  +     *  is, trying individual locations based on a blog provider, and probing
  +     *  in several locations if the blog provider is unknown.
  +     * 
  +     *  The default value for this should be false.  This should only be 
  +     *  used on server-side aggregators that generate few requests, and 
  +     *  _never_ on client-side aggregators.  The level of traffic for 
  +     *  client-side aggregators would be too great.
  +     */
  +    public static boolean AGGRESIVE_PROBING_ENABLED = false;
  +    
  +    /** If true, then after discovering what a site's blog provider is we
  +     *  probe in select locations for feeds based on the provider.  This
  +     *  is useful if autodiscovery is not enabled on this blog and we don't
  +     *  want to do the full aggressive probing.
  +     * 
  +     *  The default value for this should be false.  This should only 
  +     *  be used on server-side aggregators that generate few requests, 
  +     *  and _never_ on client-side aggregators.  The level of traffic 
  +     *  for client-side aggregators would be too great.
  +     */
  +    public static boolean BLOG_SERVICE_PROBING_ENABLED = false;
  +    
  +    /**
  +     * A regex to find any trailing filename and strip it.  It matches a
  +     * slash followed by word characters, a dot and more word characters at
  +     * the very end of the input, e.g. the "/myweblog.php" of a URL.
  +     */
  +    private static Pattern patternToStrip = Pattern.compile("/\\w*\\.\\w*$");
  +    
  +    /**
  +     * A regex to extract the user from a Xanga URL.  Group 1 captures the
  +     * word characters that follow "user=".
  +     */
  +    private static Pattern xangaURLPattern = Pattern.compile(".*user=(\\w*)");
  +    
  +    /**
  +     * Internal map to store probe URIs and their services.  Keys are
  +     * BlogService constants; each value is a FeedReference[] of feed
  +     * locations to try for that service.
  +     */
  +    private static Map probeMapping = new HashMap();
       
       static {
  -
  -        // We can use:
  -        //
  -        // <meta name="generator" content="Blogger" />
  -        //
  -        // To determine if we're on blogger.
  -
  -        //now resort to link probing
  -        //
  -        // /rss.xml (for radio blogs)
  -        // /index.rdf (for moveable type blogs
  -
  -        //FIXME: Live Journal has both FOAF and Atom...  We need support for BOTH
  -        probeMapping.put( "blogspot.com", "/atom.xml" );
  -        probeMapping.put( "livejournal.com", "/data/atom" );
  -
  -        // FIXME: 
  +        /** Associates a given BlogService with a list of usual locations to find
  +         *  their RSS file.  The locations are given as an array of FeedReferences,
  +         *  with highest quality feeds put first.  These blog providers don't
  +         *  provide consistent autodiscovery.
  +         */
  +        FeedReference blosxomLocations[] =
  +            { new FeedReference("index.rss20", FeedReference.RSS_MEDIA_TYPE),
  +              new FeedReference("index.rss", FeedReference.RSS_MEDIA_TYPE) };
  +
  +        // Diaryland doesn't offer feeds
  +        //FeedReference diaryLandLocations[] =       { "" };
  +        FeedReference bloggerLocations[] =
  +            { new FeedReference("atom.xml", FeedReference.ATOM_MEDIA_TYPE) };
           
  -        // http://www.xanga.com/rss.aspx?user=username
  +        FeedReference aolJournalLocations[] =
  +            { new FeedReference("rss.xml", FeedReference.RSS_MEDIA_TYPE) };
   
  -        // http://www.xanga.com/home.aspx?user=speedysonic
  +        FeedReference pmachineLocations[] =
  +            { new FeedReference("index.xml", FeedReference.RSS_MEDIA_TYPE) };
   
  -        // which turns out to be RSS 0.91
  +        FeedReference textPatternLocations[] =
  +            { new FeedReference("?atom=1", FeedReference.ATOM_MEDIA_TYPE),
  +              new FeedReference("?rss=1", FeedReference.RSS_MEDIA_TYPE) };
  +        
  +        FeedReference manilaLocations[] =
  +            { new FeedReference("xml/rss.xml", FeedReference.RSS_MEDIA_TYPE),
  +              new FeedReference("rss.xml", FeedReference.RSS_MEDIA_TYPE) };
  +        
  +        FeedReference typepadLocations[] =
  +            { new FeedReference("atom.xml", FeedReference.ATOM_MEDIA_TYPE),
  +              new FeedReference("index.rdf", FeedReference.RSS_MEDIA_TYPE) };
  +        
  +        FeedReference radioUserlandLocations[] =
  +            { new FeedReference("rss.xml", FeedReference.RSS_MEDIA_TYPE) };
  +        
  +        FeedReference liveJournalLocations[] =
  +            { new FeedReference("data/atom", FeedReference.ATOM_MEDIA_TYPE),
  +              new FeedReference("data/rss", FeedReference.RSS_MEDIA_TYPE) };
  +        
  +        FeedReference wordPressLocations[] =
  +            { new FeedReference("wp-atom.php", FeedReference.ATOM_MEDIA_TYPE),
  +              new FeedReference("wp-rss2.php", FeedReference.RSS_MEDIA_TYPE),
  +              new FeedReference("wp-rss.php", FeedReference.RSS_MEDIA_TYPE) };
           
  -        //probeMapping.put( "livejournal.com", "/data/atom" );
  +        FeedReference iBlogLocations[] =
  +            { new FeedReference("rss.xml", FeedReference.RSS_MEDIA_TYPE) };
  +        
  +        // Xanga feeds have to be handled specially since they put their
  +        // feeds at the location: http://www.xanga.com/rss.aspx?user=username
  +        FeedReference xangaLocations[] =
  +            { new FeedReference("rss.aspx?user=", FeedReference.RSS_MEDIA_TYPE) };
  +        
  +        FeedReference unknownLocations[] =
  +            { new FeedReference("atom.xml",FeedReference.ATOM_MEDIA_TYPE),
  +              new FeedReference("index.rss", FeedReference.RSS_MEDIA_TYPE),
  +              new FeedReference("rss.xml", FeedReference.RSS_MEDIA_TYPE),
  +              new FeedReference("index.rdf", FeedReference.RSS_MEDIA_TYPE), 
  +              new FeedReference("index.xml", FeedReference.XML_MEDIA_TYPE) };
  +              
  +        probeMapping.put( BlogService.BLOSXOM,        blosxomLocations );
  +
  +        //Tue Aug 31 2004 04:21 PM (burton@rojo.com): Diaryland doesn't
  +        //currently offer RSS or Atom feeds (shame, shame, shame).  This is a
  +        //placeholder until they see the light and provide Atom feeds.
  +        
  +        //probeMapping.put( BlogService.DIARYLAND,      diaryLandLocations );
   
  +        probeMapping.put( BlogService.BLOGGER,        bloggerLocations  ); 
  +        probeMapping.put( BlogService.AOL_JOURNAL,    aolJournalLocations );  
  +        probeMapping.put( BlogService.PMACHINE,       pmachineLocations );
  +        probeMapping.put( BlogService.TEXTPATTERN,    textPatternLocations );  
  +        probeMapping.put( BlogService.MANILA,         manilaLocations );   
  +        probeMapping.put( BlogService.TYPEPAD,        typepadLocations );
  +        probeMapping.put( BlogService.RADIO_USERLAND, radioUserlandLocations );
  +        probeMapping.put( BlogService.LIVEJOURNAL,    liveJournalLocations );
  +        probeMapping.put( BlogService.WORDPRESS,      wordPressLocations );
  +        probeMapping.put( BlogService.IBLOG,          iBlogLocations );
  +        probeMapping.put( BlogService.XANGA,          xangaLocations);
  +        probeMapping.put( BlogService.UNKNOWN,        unknownLocations );
       }
       
       /**
  @@ -76,31 +166,167 @@
       public static final List locate( String resource, String content, FeedList list )
           throws Exception {
   
  -        //FIXME: 
  +        if ( BLOG_SERVICE_PROBING_ENABLED || AGGRESIVE_PROBING_ENABLED ) {
  +
  +            // determine what blog service we are dealing with
   
  -        String domain = ResourceExpander.getDomain( resource );
  +            BlogService blogService = BlogServiceDiscovery.discover( resource, content );
  +            
  +            String baseFeedPath = getFeedPath( resource );
   
  -        if ( probeMapping.containsKey( domain ) ) {
  +            FeedReference mapping[] = null;
   
  -            String mapping = (String)probeMapping.get( domain );
  +            HashSet previousAttempts = new HashSet();
   
  -            String href = resource;
  -            
  -            if ( href.endsWith( "/" ) )
  -                href = href.substring( 0, href.length() - 1 );
  +            boolean feedFound = false;
  +    
  +            if ( probeMapping.containsKey( blogService ) ) {
  +
  +                mapping = (FeedReference[])probeMapping.get( blogService );
                   
  -            href += mapping;
  +                // try out each mapping
  +                for (int i = 0; i < mapping.length; i++) {
  +                    String pathToTest = baseFeedPath + mapping[i].resource;
  +                    
  +                    // we have to do special probing for Xanga
  +                    if ( blogService.equals( BlogService.XANGA ) ) {
  +                        pathToTest += getXangaUser(resource);;
  +                    }
  +                    
  +                    if ( feedExists( pathToTest ) ) {
  +                        FeedReference feedReference = new FeedReference( pathToTest,
  +                                                                         mapping[i].type );
  +                        feedReference.method = FeedReference.METHOD_PROBE_DISCOVERY;
  +
  +                        onFeedReference( feedReference, list );
  +
  +                        feedFound = true;
  +
  +                    }
  +                    
  +                    // record this attempt so we don't repeat it again if
  +                    // we are doing aggresive probing
  +                    previousAttempts.add( pathToTest );
  +                }
  +            }
               
  -            FeedReference feedReference = new FeedReference( href,
  -                                                             FeedReference.ATOM_MEDIA_TYPE );
  +            // if we have nothing so far, do aggresive probing
  +            if ( AGGRESIVE_PROBING_ENABLED && feedFound == false ) {
  +
  +                mapping = (FeedReference[])probeMapping.get( BlogService.UNKNOWN );
                   
  -            list.add( feedReference );
  -            list.setAdAtomFeed( feedReference );
  +                // try out each mapping
  +                for (int i = 0; i < mapping.length; i++) {
   
  +                    //NOTE: this shares duplicate code with the above tests.
  +                    
  +                    String pathToTest = baseFeedPath + mapping[i].resource;
  +                    if ( previousAttempts.contains( pathToTest ) == false ) {
  +                        if ( feedExists( pathToTest ) ) {
  +
  +                            FeedReference feedReference = new FeedReference( pathToTest,
  +                                                                             mapping[i].type);
  +                            
  +                            feedReference.method = FeedReference.METHOD_PROBE_DISCOVERY;
  +
  +                            onFeedReference( feedReference, list );
  +
  +                        }
  +                    }
  +                }
  +            }
           }
   
           return list;
   
  +    }
  +
  +    /**
  +     * Registers a feed that was just discovered.  An Atom feed fills the
  +     * list's Ad Atom slot if that slot is still empty; otherwise an RSS feed
  +     * fills the Ad RSS slot if that one is still empty.  The reference is
  +     * added to the list in every case.
  +     *
  +     * @author <a href="mailto:burton@rojo.com">Kevin A. Burton</a>
  +     */
  +    private static void onFeedReference( FeedReference ref, FeedList list ) {
  +
  +        boolean claimsAtomSlot = list.getAdAtomFeed() == null
  +            && FeedReference.ATOM_MEDIA_TYPE.equals( ref.type );
  +
  +        if ( claimsAtomSlot ) {
  +
  +            list.setAdAtomFeed( ref );
  +
  +        } else {
  +
  +            // the RSS slot is only examined when the Atom slot was not claimed
  +            boolean claimsRssSlot = list.getAdRSSFeed() == null
  +                && FeedReference.RSS_MEDIA_TYPE.equals( ref.type );
  +
  +            if ( claimsRssSlot ) {
  +                list.setAdRSSFeed( ref );
  +            }
  +
  +        }
  +
  +        list.add( ref );
  +
  +    }
  +                                         
  +    /** This method takes a resource, such as "http://www.codinginparadise.org/myweblog.php",
  +     *  and gets the path necessary to build up a feed, such as 
  +     *  "http://www.codinginparadise.org/".  Basically it drops any query string or
  +     *  anchor, removes a file name that might be at the end, such as "myweblog.php",
  +     *  and appends a slash to the end if there is not one.  The host part of the
  +     *  resource is never removed, so "http://example.com" yields "http://example.com/".
  +     * 
  +     *  @param resource The URI of the weblog or website being probed.
  +     *  @return The base path, always ending in "/", to which feed locations
  +     *  are appended.
  +     *  @throws MalformedURLException Declared for callers; this body only does
  +     *  string manipulation and does not throw it.
  +     * 
  +     *  @author Brad Neuberg, bkn3@columbia.edu
  +     */
  +    protected static String getFeedPath(String resource) 
  +        throws MalformedURLException {
  +        
  +        // strip off any query string or anchors: cut at whichever of '#' and
  +        // '?' comes first, otherwise "page.php?x=1#top" would keep its query
  +        int anchorAt = resource.indexOf("#");
  +        int queryAt = resource.indexOf("?");
  +        int startStripAt;
  +
  +        if ( anchorAt == -1 ) {
  +            startStripAt = queryAt;
  +        } else if ( queryAt == -1 ) {
  +            startStripAt = anchorAt;
  +        } else {
  +            startStripAt = Math.min( anchorAt, queryAt );
  +        }
  +        if (startStripAt != -1) {
  +            resource = resource.substring(0, startStripAt);
  +        }
  +
  +        // Only look for a file name after the host.  Applied to the whole
  +        // resource the pattern also matches the "/example.com" of
  +        // "http://example.com" and would strip the host itself.
  +        int schemeAt = resource.indexOf("://");
  +        int pathAt = resource.indexOf("/", schemeAt == -1 ? 0 : schemeAt + 3);
  +
  +        if ( pathAt != -1 ) {
  +            Matcher fileMatcher = patternToStrip.matcher(resource.substring(pathAt));
  +            resource = resource.substring(0, pathAt) + fileMatcher.replaceAll("");
  +        }
  +    	
  +        if ( !resource.endsWith( "/" ) ) {
  +            resource = resource + "/";
  +        }
  +        
  +        return resource;
  +    }
  +    
  +    /** Checks whether a feed is served at the given URI by issuing an HTTP
  +     *  HEAD request.  Redirects are not followed, and only a response code
  +     *  of exactly 200 counts as the feed existing.
  +     * 
  +     *  @param resource The full URI to the resource to check for.
  +     *  @return true if the response code is 200, false for any other code.
  +     *  @throws Exception Whatever the underlying ResourceRequest throws.
  +     * 
  +     *  @author Brad Neuberg, bkn3@columbia.edu
  +     */
  +    protected static boolean feedExists(String resource) throws Exception {
  +
  +        ResourceRequest headRequest =
  +            ResourceRequestFactory.getResourceRequest( resource );
  +
  +        headRequest.setRequestMethod( "HEAD" );
  +        headRequest.setFollowRedirects( false );
  +
  +        // asking for the content length is what actually opens the
  +        // connection; the value itself is not needed
  +        headRequest.getContentLength();
  +
  +        return headRequest.getResponseCode() == 200;
  +    }
  +    
  +    /** Xanga's feed locations are dependent on the 'user' attribute in a
  +     *  Xanga URI.  This method helps extract the user element from an 
  +     *  existing URI, such as http://www.xanga.com/home.aspx?user=wdfphillz.
  +     *  Anything following the user value (further parameters, an anchor)
  +     *  is ignored.
  +     * 
  +     *  @param resource The Xanga URI to inspect.
  +     *  @return The word characters following the last "user=" in the URI, or
  +     *  an empty string if the URI contains no "user=".
  +     */
  +    protected static String getXangaUser(String resource) {
  +        Matcher xangaMatcher = xangaURLPattern.matcher(resource);
  +
  +        // group(1) throws IllegalStateException unless a match succeeded, and
  +        // matches() rejected every URI that continues after the user value;
  +        // find() returns the same group whenever matches() would have.
  +        if ( !xangaMatcher.find() ) {
  +            return "";
  +        }
  +        
  +        return xangaMatcher.group(1);
       }
   
       public static void main( String[] args ) throws Exception {
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: commons-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: commons-dev-help@jakarta.apache.org