You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@commons.apache.org by bu...@apache.org on 2004/04/15 02:58:44 UTC

cvs commit: jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate AnchorParser.java AnchorParserListener.java EntityDecoder.java DiscoveryLocator.java FeedLocator.java FeedReference.java

burton      2004/04/14 17:58:44

  Modified:    feedparser TODO
               feedparser/src/java/org/apache/commons/feedparser
                        FeedParser.java LinkFeedParserListener.java
                        MetaFeedParserListener.java NS.java
                        RSSFeedParser.java
               feedparser/src/java/org/apache/commons/feedparser/locate
                        DiscoveryLocator.java FeedLocator.java
                        FeedReference.java
  Added:       feedparser/src/java/org/apache/commons/feedparser/locate
                        AnchorParser.java AnchorParserListener.java
                        EntityDecoder.java
  Log:
  init support for RSS location
  
  Revision  Changes    Path
  1.7       +5 -1      jakarta-commons-sandbox/feedparser/TODO
  
  Index: TODO
  ===================================================================
  RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/TODO,v
  retrieving revision 1.6
  retrieving revision 1.7
  diff -u -r1.6 -r1.7
  --- TODO	2 Mar 2004 08:27:52 -0000	1.6
  +++ TODO	15 Apr 2004 00:58:43 -0000	1.7
  @@ -42,4 +42,8 @@
   - What are my options for XPATH libraries... I should benchmark these..  Also
     see if using SAX will just be faster.
   
  -- Full documentation on how we have to handle dates.
  \ No newline at end of file
  +- Full documentation on how we have to handle dates.
  +
  +- Support textinput
  +
  +- All FeedParser exceptions should include the URL of the feed if possible.
  
  
  
  1.4       +16 -18    jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/FeedParser.java
  
  Index: FeedParser.java
  ===================================================================
  RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/FeedParser.java,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- FeedParser.java	13 Mar 2004 07:05:57 -0000	1.3
  +++ FeedParser.java	15 Apr 2004 00:58:44 -0000	1.4
  @@ -42,27 +42,15 @@
   public class FeedParser {
   
       /**
  -     *
  -     * The mediaType can be one of the following:
  -     *
  -     * <li>application/rss+xml</li>
  -     * <li>application/atom+xml</li>
  -     * 
  -     * @author <a href="mailto:burton@peerfear.org">Kevin Burton</a>
  -     */
  -    public static void parse( FeedParserListener listener,
  -                              InputStream is,
  -                              String mediaType) throws FeedParserException {
  -
  -    }
  -
  -    /**
        * Parse this feed.
  +     * 
  +     * @param resource The URL of the feed being parsed.  This is optional and
  +     * may be null but is used when an exception is thrown to aid debugging.
        *
  -     * @author <a href="mailto:burton@peerfear.org">Kevin A. Burton</a>
        */
       public static void parse( FeedParserListener listener,
  -                              InputStream is ) throws FeedParserException {
  +                              InputStream is ,
  +                              String resource ) throws FeedParserException {
   
           try { 
               
  @@ -76,6 +64,16 @@
               //if an explicit FeedParserException is thrown just rethrow it..
               throw fpe;
           } catch ( Throwable t ) { throw new FeedParserException( t ); }
  +
  +    }
  +
  +    /**
  +     * @deprecated Use #parse( FeedParserListener, InputStream, String )
  +     */
  +    public static void parse( FeedParserListener listener,
  +                              InputStream is ) throws FeedParserException {
  +
  +        parse( listener, is, null );
           
       }
   
  
  
  
  1.3       +5 -3      jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/LinkFeedParserListener.java
  
  Index: LinkFeedParserListener.java
  ===================================================================
  RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/LinkFeedParserListener.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- LinkFeedParserListener.java	28 Feb 2004 03:35:22 -0000	1.2
  +++ LinkFeedParserListener.java	15 Apr 2004 00:58:44 -0000	1.3
  @@ -19,7 +19,7 @@
   import org.jdom.*;
   
   /**
  - * Atom link and RSS 1.0 mod_link support.
  + * Atom link, RSS 2.0 enclosure, and RSS 1.0 mod_link support.
    * 
    * @author <a href="mailto:burton@apache.org">Kevin A. Burton (burtonator)</a>
    * @version $Id$
  @@ -28,12 +28,14 @@
   
       /**
        *
  +     * @param length The length of the content or -1 if not specified.
        */
       public void onLink( FeedParserState state,
                           String rel,
                           String type,
                           String href,
  -                        String title ) throws FeedParserException;
  +                        String title,
  +                        long length ) throws FeedParserException;
   
   }
   
  
  
  
  1.5       +5 -1      jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/MetaFeedParserListener.java
  
  Index: MetaFeedParserListener.java
  ===================================================================
  RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/MetaFeedParserListener.java,v
  retrieving revision 1.4
  retrieving revision 1.5
  diff -u -r1.4 -r1.5
  --- MetaFeedParserListener.java	13 Mar 2004 07:05:57 -0000	1.4
  +++ MetaFeedParserListener.java	15 Apr 2004 00:58:44 -0000	1.5
  @@ -83,7 +83,11 @@
       public void onIssued( FeedParserState state, String content ) throws FeedParserException; 
       public void onIssuedEnd() throws FeedParserException;
   
  +    /**
  +     * RSS 2.0 category.  Dublin Core.
  +     */
       public void onSubject( FeedParserState state, String content ) throws FeedParserException;
  +
       public void onSubjectEnd() throws FeedParserException;
       
   }
  
  
  
  1.3       +1 -13     jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/NS.java
  
  Index: NS.java
  ===================================================================
  RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/NS.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- NS.java	28 Feb 2004 03:35:22 -0000	1.2
  +++ NS.java	15 Apr 2004 00:58:44 -0000	1.3
  @@ -54,23 +54,11 @@
       public static final Namespace DCTERMS =
           Namespace.getNamespace( "dcterms", "http://purl.org/dc/terms/" );
   
  -    public static final Namespace MARKED =
  -        Namespace.getNamespace( "marked", "http://newsmonster.org/schemas/marked/" );
  -
       public static final Namespace SUBSCRIPTION =
           Namespace.getNamespace( "sub", "http://purl.org/rss/1.0/modules/subscription/" );
   
       public static final Namespace NC =
           Namespace.getNamespace( "NC", "http://home.netscape.com/NC-rdf#" );
  -
  -    public static final Namespace NM =
  -        Namespace.getNamespace( "nm", "http://newsmonster.org/nm-rdf#" );
  -
  -    public static final Namespace BLOGMETRIC =
  -        Namespace.getNamespace( "blogmetric", "http://newsmonster.org/src/1.0/blogmetric/" );
  -
  -    public static final Namespace IMPLICIT =
  -        Namespace.getNamespace( "implicit", "http://newsmonster.org/scf/1.0/modules/implicit/" );
   
       public static final Namespace XHTML =
           Namespace.getNamespace( "xhtml", "http://www.w3.org/1999/xhtml" );
  
  
  
  1.4       +4 -1      jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/RSSFeedParser.java
  
  Index: RSSFeedParser.java
  ===================================================================
  RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/RSSFeedParser.java,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- RSSFeedParser.java	28 Feb 2004 03:35:22 -0000	1.3
  +++ RSSFeedParser.java	15 Apr 2004 00:58:44 -0000	1.4
  @@ -148,6 +148,9 @@
           xpath.addNamespace( NS.RDF.getPrefix(), NS.RDF.getURI() );
           Object node = xpath.selectSingleNode( state.current );
   
  +        //FIXME: if this is a GUID and isPermaLink=false don't use it as the
  +        //permalink.
  +        
           if ( node instanceof Element ) {
               resource = ((Element)node).getText();
           } else if ( node instanceof Attribute ) {
  
  
  
  1.3       +1 -0      jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/DiscoveryLocator.java
  
  Index: DiscoveryLocator.java
  ===================================================================
  RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/DiscoveryLocator.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- DiscoveryLocator.java	28 Feb 2004 03:35:22 -0000	1.2
  +++ DiscoveryLocator.java	15 Apr 2004 00:58:44 -0000	1.3
  @@ -19,6 +19,7 @@
   import java.io.*;
   import java.util.*;
   
  +//FIXME: do NOT use apache regex as it has major problems.
   import org.apache.regexp.*;
   
   /**
  
  
  
  1.3       +5 -0      jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/FeedLocator.java
  
  Index: FeedLocator.java
  ===================================================================
  RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/FeedLocator.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- FeedLocator.java	28 Feb 2004 03:35:22 -0000	1.2
  +++ FeedLocator.java	15 Apr 2004 00:58:44 -0000	1.3
  @@ -33,6 +33,7 @@
        * 
        * Example: http://peerfear.org
        *
  +     * @param resource The weblog we need to discover
        * @author <a href="mailto:burton@peerfear.org">Kevin A. Burton</a>
        */
       public static final List locate( String resource ) throws Exception {
  @@ -59,6 +60,10 @@
           LinkedList list = new LinkedList();
   
           DiscoveryLocator.locate( resource, content, list );
  +
  +        //FIXME: handle the case where we fail to locate the feed with link discovery.
  +
  +        //FIXME: if we still fail try location link probing /index.rdf, /index.xml
           
           return list;
           
  
  
  
  1.3       +0 -2      jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/FeedReference.java
  
  Index: FeedReference.java
  ===================================================================
  RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/FeedReference.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- FeedReference.java	28 Feb 2004 03:35:22 -0000	1.2
  +++ FeedReference.java	15 Apr 2004 00:58:44 -0000	1.3
  @@ -16,8 +16,6 @@
   
   package org.apache.commons.feedparser.locate;
   
  -import org.peerfear.newsmonster.network.*;
  -
   import java.io.*;
   import java.util.*;
   
  
  
  
  1.1                  jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/AnchorParser.java
  
  Index: AnchorParser.java
  ===================================================================
  /*
   * Copyright 1999,2004 The Apache Software Foundation.
   * 
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
   * You may obtain a copy of the License at
   * 
   *      http://www.apache.org/licenses/LICENSE-2.0
   * 
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */
  
  package org.apache.commons.feedparser.locate;
  
  import java.io.*;
  import java.net.*;
  import java.util.*;
  import java.util.regex.*;
  
  /**
   *
   * Given HTML pull out an array of anchors
   *
   * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
   */
  public class AnchorParser {

      /**
       * Regular expression used to find anchors.  Group 1 is the value of the
       * <code>href</code> attribute and group 2 is the text that follows the
       * opening tag up to the next <code>&lt;</code>.
       *
       * Other attributes may appear before <code>href</code> (they are consumed
       * by the leading <code>[^&gt;]*</code>).  Known limitations of this
       * expression:
       *
       * <li>the tag name must be followed by a literal space, so an anchor
       * written as <code>&lt;a</code> + newline + <code>href=...</code> is not
       * found</li>
       * <li>an unquoted href value is not terminated by whitespace, so any
       * attributes that follow it end up in group 1</li>
       * <li>an anchor whose content starts with another tag (for example an
       * image) is skipped because group 2 requires at least one character that
       * is not <code>&lt;</code></li>
       */
      public static final String LINK_REGEXP = "<a [^>]*href=[\"']?([^\">']+)[\"']?[^>]*>([^<]+)";

      //NOTE: MULTILINE only changes the meaning of ^ and $ which LINK_REGEXP does
      //not use; the flag is kept so the compiled pattern is unchanged.
      static Pattern pattern = Pattern.compile( LINK_REGEXP,
                                                Pattern.CASE_INSENSITIVE | Pattern.MULTILINE );

      /**
       * Scan the given HTML for anchors and report each one to the listener.
       *
       * For every match of {@link #LINK_REGEXP} the href and the (trimmed) link
       * text are entity-decoded with {@link EntityDecoder#decode} and passed to
       * {@link AnchorParserListener#onAnchor}.  The <code>rel</code> argument is
       * always reported as <code>null</code> because the expression does not
       * capture it.
       *
       * @param content The HTML to scan.  Must not be null.
       * @param listener Receives one onAnchor callback per anchor found, in
       * document order.
       * @deprecated use HTParser
       * @author <a href="mailto:burton@peerfear.org">Kevin A. Burton</a>
       */
      public static void parseAnchors( String content, AnchorParserListener listener ) {

          Matcher m = pattern.matcher( content );

          while ( m.find() ) {

              //decode entities so that listeners see plain text (for example
              //&amp; within a query string becomes &)

              String resource = EntityDecoder.decode( m.group( 1 ) );
              String title = EntityDecoder.decode( m.group( 2 ).trim() );

              listener.onAnchor( resource, null, title );

          }

      }

  }
  
  
  
  1.1                  jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/AnchorParserListener.java
  
  Index: AnchorParserListener.java
  ===================================================================
  /*
   * Copyright 1999,2004 The Apache Software Foundation.
   * 
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
   * You may obtain a copy of the License at
   * 
   *      http://www.apache.org/licenses/LICENSE-2.0
   * 
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */
  
  package org.apache.commons.feedparser.locate;
  
  import java.io.*;
  import java.net.*;
  import java.util.*;
  import java.util.regex.*;
  
  /**
   *
   * Callback interface which receives the anchors (and images) found while parsing HTML.
   *
   * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
   */
  public interface AnchorParserListener {

      /**
       * Hand an arbitrary object to the listener.
       *
       * NOTE(review): no caller is visible here; presumably this lets the code
       * driving the parse pass per-document state to the listener -- confirm.
       *
       * @param context Opaque, caller-defined object.
       */
      void setContext( Object context );

      /**
       * Called once for each anchor found.
       *
       * @param href The target of the anchor.
       * @param rel The rel attribute of the anchor.  May be null;
       * AnchorParser.parseAnchors always passes null.
       * @param title The text of the anchor.
       */
      void onAnchor( String href, String rel, String title );

      /**
       * Called for an image.
       *
       * NOTE(review): nothing visible here invokes this callback; width and
       * height are presumably the raw attribute values -- confirm.
       *
       * @param src The source of the image.
       * @param width The width of the image.
       * @param height The height of the image.
       */
      void onImage( String src, String width, String height );

  }
  
  
  
  1.1                  jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/EntityDecoder.java
  
  Index: EntityDecoder.java
  ===================================================================
  /*
   * Copyright 1999,2004 The Apache Software Foundation.
   * 
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
   * You may obtain a copy of the License at
   * 
   *      http://www.apache.org/licenses/LICENSE-2.0
   * 
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */
  
  package org.apache.commons.feedparser.locate;
  
  import java.io.*;
  import java.net.*;
  import java.util.*;
  import java.util.regex.*;
  
  /**
   *
   * Given a piece of HTML we will decode the entities it contains.  This is a
   * trivial implementation and we need to go through and make sure all HTML
   * entities are escaped correctly.
   * 
   * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
   * @version $Id: EntityDecoder.java,v 1.1 2004/04/15 00:58:44 burton Exp $
   */
  public class EntityDecoder {

      /**
       * Maps an entity name (without the leading &amp; and trailing ;) to the
       * text it is replaced with.
       */
      private static final HashMap entities = new HashMap();

      /**
       * Matches a named entity made up of lowercase ASCII letters only; group 1
       * is the entity name.  Numeric references (&amp;#38;) and names containing
       * uppercase letters or digits are not matched and are therefore left
       * untouched by {@link #decode}.
       */
      static Pattern pattern = Pattern.compile( "&([a-z]+);" );

      static {

          //FIXME: there are a LOT more of these and we need an exhaustive collection.

          entities.put( "gt", ">" );
          entities.put( "apos", "'" );
          entities.put( "lt", "<" );
          entities.put( "amp", "&" );

          //NOTE(review): these two decode to the empty string which silently
          //drops the guillemets.  Confirm whether that is intentional or whether
          //the original characters were lost to an encoding problem.
          entities.put( "raquo", "" );
          entities.put( "laquo", "" );

      }

      /**
       * Replace every known named entity in the given content with its decoded
       * text.  Entities we do not know about are copied through unchanged.
       *
       * The result of one replacement is never rescanned, so
       * <code>&amp;amp;lt;</code> decodes to <code>&amp;lt;</code> and not to
       * <code>&lt;</code>.
       *
       * @param content The text to decode.  Must not be null.
       * @return The content with all known entities decoded.
       */
      public static String decode( String content ) {

          //FIXME(performance): do I have existing code that does this more efficiently?

          StringBuffer buff = new StringBuffer( content.length() );

          Matcher m = pattern.matcher( content );

          //offset of the first character which has not been copied to buff yet
          int index = 0;

          while ( m.find() ) {

              //copy the literal text between the previous entity and this one
              buff.append( content.substring( index, m.start( 0 ) ) );

              //a single lookup is enough because the table holds no null values
              Object replacement = entities.get( m.group( 1 ) );

              if ( replacement != null ) {
                  buff.append( replacement );
              } else {
                  //found an entity we know NOTHING about so keep it verbatim.
                  //Should we warn?
                  buff.append( m.group( 0 ) );
              }

              index = m.end( 0 );

          }

          //copy whatever follows the last entity (or everything if none matched)
          buff.append( content.substring( index, content.length() ) );

          return buff.toString();

      }

      /**
       * Ad hoc smoke test which prints a few decoded samples to stdout.
       */
      public static void main( String[] args ) throws Exception {

          System.out.println( decode( "&amp;" ) );
          System.out.println( decode( "asdf&amp;asdf" ) );

          System.out.println( decode( "asdf&amp;" ) );

          System.out.println( decode( "&amp;asdf" ) );

      }

  }
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: commons-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: commons-dev-help@jakarta.apache.org