You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@commons.apache.org by bu...@apache.org on 2004/04/15 02:58:44 UTC
cvs commit: jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate AnchorParser.java AnchorParserListener.java EntityDecoder.java DiscoveryLocator.java FeedLocator.java FeedReference.java
burton 2004/04/14 17:58:44
Modified: feedparser TODO
feedparser/src/java/org/apache/commons/feedparser
FeedParser.java LinkFeedParserListener.java
MetaFeedParserListener.java NS.java
RSSFeedParser.java
feedparser/src/java/org/apache/commons/feedparser/locate
DiscoveryLocator.java FeedLocator.java
FeedReference.java
Added: feedparser/src/java/org/apache/commons/feedparser/locate
AnchorParser.java AnchorParserListener.java
EntityDecoder.java
Log:
init support for RSS location
Revision Changes Path
1.7 +5 -1 jakarta-commons-sandbox/feedparser/TODO
Index: TODO
===================================================================
RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/TODO,v
retrieving revision 1.6
retrieving revision 1.7
diff -u -r1.6 -r1.7
--- TODO 2 Mar 2004 08:27:52 -0000 1.6
+++ TODO 15 Apr 2004 00:58:43 -0000 1.7
@@ -42,4 +42,8 @@
- What are my options for XPATH libraries... I should benchmark these.. Also
see if using SAX will just be faster.
-- Full documentation on how we have to handle dates.
\ No newline at end of file
+- Full documentation on how we have to handle dates.
+
+- Support textinput
+
+- All FeedParser exceptions should include the URL of the feed if possible.
1.4 +16 -18 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/FeedParser.java
Index: FeedParser.java
===================================================================
RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/FeedParser.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- FeedParser.java 13 Mar 2004 07:05:57 -0000 1.3
+++ FeedParser.java 15 Apr 2004 00:58:44 -0000 1.4
@@ -42,27 +42,15 @@
public class FeedParser {
/**
- *
- * The mediaType can be one of the following:
- *
- * <li>application/rss+xml</li>
- * <li>application/atom+xml</li>
- *
- * @author <a href="mailto:burton@peerfear.org">Kevin Burton</a>
- */
- public static void parse( FeedParserListener listener,
- InputStream is,
- String mediaType) throws FeedParserException {
-
- }
-
- /**
* Parse this feed.
+ *
+ * @param resource The URL of the feed being parsed. This is optional and
+ * may be null but is used when an exception is thrown to aid debugging.
*
- * @author <a href="mailto:burton@peerfear.org">Kevin A. Burton</a>
*/
public static void parse( FeedParserListener listener,
- InputStream is ) throws FeedParserException {
+ InputStream is ,
+ String resource ) throws FeedParserException {
try {
@@ -76,6 +64,16 @@
//if an explicit FeedParserException is thrown just rethrow it..
throw fpe;
} catch ( Throwable t ) { throw new FeedParserException( t ); }
+
+ }
+
+ /**
+ * @deprecated Use #parse( FeedParserListener, InputStream, String )
+ */
+ public static void parse( FeedParserListener listener,
+ InputStream is ) throws FeedParserException {
+
+ parse( listener, is, null );
}
1.3 +5 -3 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/LinkFeedParserListener.java
Index: LinkFeedParserListener.java
===================================================================
RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/LinkFeedParserListener.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- LinkFeedParserListener.java 28 Feb 2004 03:35:22 -0000 1.2
+++ LinkFeedParserListener.java 15 Apr 2004 00:58:44 -0000 1.3
@@ -19,7 +19,7 @@
import org.jdom.*;
/**
- * Atom link and RSS 1.0 mod_link support.
+ * Atom link, RSS 2.0 enclosure, and RSS 1.0 mod_link support.
*
* @author <a href="mailto:burton@apache.org">Kevin A. Burton (burtonator)</a>
* @version $Id$
@@ -28,12 +28,14 @@
/**
*
+ * @param length The length of the content or -1 if not specified.
*/
public void onLink( FeedParserState state,
String rel,
String type,
String href,
- String title ) throws FeedParserException;
+ String title,
+ long length ) throws FeedParserException;
}
1.5 +5 -1 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/MetaFeedParserListener.java
Index: MetaFeedParserListener.java
===================================================================
RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/MetaFeedParserListener.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -r1.4 -r1.5
--- MetaFeedParserListener.java 13 Mar 2004 07:05:57 -0000 1.4
+++ MetaFeedParserListener.java 15 Apr 2004 00:58:44 -0000 1.5
@@ -83,7 +83,11 @@
public void onIssued( FeedParserState state, String content ) throws FeedParserException;
public void onIssuedEnd() throws FeedParserException;
+ /**
+ * RSS 2.0 category. Dublin Core.
+ */
public void onSubject( FeedParserState state, String content ) throws FeedParserException;
+
public void onSubjectEnd() throws FeedParserException;
}
1.3 +1 -13 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/NS.java
Index: NS.java
===================================================================
RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/NS.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- NS.java 28 Feb 2004 03:35:22 -0000 1.2
+++ NS.java 15 Apr 2004 00:58:44 -0000 1.3
@@ -54,23 +54,11 @@
public static final Namespace DCTERMS =
Namespace.getNamespace( "dcterms", "http://purl.org/dc/terms/" );
- public static final Namespace MARKED =
- Namespace.getNamespace( "marked", "http://newsmonster.org/schemas/marked/" );
-
public static final Namespace SUBSCRIPTION =
Namespace.getNamespace( "sub", "http://purl.org/rss/1.0/modules/subscription/" );
public static final Namespace NC =
Namespace.getNamespace( "NC", "http://home.netscape.com/NC-rdf#" );
-
- public static final Namespace NM =
- Namespace.getNamespace( "nm", "http://newsmonster.org/nm-rdf#" );
-
- public static final Namespace BLOGMETRIC =
- Namespace.getNamespace( "blogmetric", "http://newsmonster.org/src/1.0/blogmetric/" );
-
- public static final Namespace IMPLICIT =
- Namespace.getNamespace( "implicit", "http://newsmonster.org/scf/1.0/modules/implicit/" );
public static final Namespace XHTML =
Namespace.getNamespace( "xhtml", "http://www.w3.org/1999/xhtml" );
1.4 +4 -1 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/RSSFeedParser.java
Index: RSSFeedParser.java
===================================================================
RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/RSSFeedParser.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- RSSFeedParser.java 28 Feb 2004 03:35:22 -0000 1.3
+++ RSSFeedParser.java 15 Apr 2004 00:58:44 -0000 1.4
@@ -148,6 +148,9 @@
xpath.addNamespace( NS.RDF.getPrefix(), NS.RDF.getURI() );
Object node = xpath.selectSingleNode( state.current );
+ //FIXME: if this is a GUID and isPermaLink=false don't use it as the
+ //permalink.
+
if ( node instanceof Element ) {
resource = ((Element)node).getText();
} else if ( node instanceof Attribute ) {
1.3 +1 -0 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/DiscoveryLocator.java
Index: DiscoveryLocator.java
===================================================================
RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/DiscoveryLocator.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- DiscoveryLocator.java 28 Feb 2004 03:35:22 -0000 1.2
+++ DiscoveryLocator.java 15 Apr 2004 00:58:44 -0000 1.3
@@ -19,6 +19,7 @@
import java.io.*;
import java.util.*;
+//FIXME: do NOT use apache regex as it has major problems.
import org.apache.regexp.*;
/**
1.3 +5 -0 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/FeedLocator.java
Index: FeedLocator.java
===================================================================
RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/FeedLocator.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- FeedLocator.java 28 Feb 2004 03:35:22 -0000 1.2
+++ FeedLocator.java 15 Apr 2004 00:58:44 -0000 1.3
@@ -33,6 +33,7 @@
*
* Example: http://peerfear.org
*
+ * @param resource The weblog we need to discover
* @author <a href="mailto:burton@peerfear.org">Kevin A. Burton</a>
*/
public static final List locate( String resource ) throws Exception {
@@ -59,6 +60,10 @@
LinkedList list = new LinkedList();
DiscoveryLocator.locate( resource, content, list );
+
+ //FIXME: handle the case where we fail to locate a feed with link discovery.
+
+ //FIXME: if we still fail, try probing well-known locations: /index.rdf, /index.xml
return list;
1.3 +0 -2 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/FeedReference.java
Index: FeedReference.java
===================================================================
RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/FeedReference.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- FeedReference.java 28 Feb 2004 03:35:22 -0000 1.2
+++ FeedReference.java 15 Apr 2004 00:58:44 -0000 1.3
@@ -16,8 +16,6 @@
package org.apache.commons.feedparser.locate;
-import org.peerfear.newsmonster.network.*;
-
import java.io.*;
import java.util.*;
1.1 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/AnchorParser.java
Index: AnchorParser.java
===================================================================
/*
* Copyright 1999,2004 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.feedparser.locate;
import java.io.*;
import java.net.*;
import java.util.*;
import java.util.regex.*;
/**
 *
 * Given HTML, finds anchor tags with a regular expression and reports each
 * one to an AnchorParserListener.
 *
 * This is a pattern match over the raw text, not an HTML parser, so it has
 * known limits that follow directly from LINK_REGEXP:
 *
 * <ul>
 * <li>the tag must start with "&lt;a" followed by a single space character;</li>
 * <li>the anchor must be followed immediately by at least one character of
 *     text, so anchors whose first child is another tag are skipped;</li>
 * <li>an unquoted href value is captured up to the closing "&gt;" of the tag,
 *     so it may include any attributes that follow it.</li>
 * </ul>
 *
 * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
 */
public class AnchorParser {

    /**
     * Expression used to find anchors. Group 1 is the href value and group 2
     * is the text between the end of the opening tag and the next "&lt;".
     * Other attributes may appear before href because the expression skips
     * any run of non-"&gt;" characters ahead of it.
     */
    public static final String LINK_REGEXP = "<a [^>]*href=[\"']?([^\">']+)[\"']?[^>]*>([^<]+)";

    /** Compiled form of LINK_REGEXP; tag and attribute names match in any case. */
    static Pattern pattern = Pattern.compile( LINK_REGEXP,
                                              Pattern.CASE_INSENSITIVE | Pattern.MULTILINE );

    /**
     * Get links from the given html with included titles and other metainfo.
     *
     * For every match of LINK_REGEXP the listener's onAnchor method is called
     * once, in document order, with the entity-decoded href, a null rel (the
     * expression does not capture it) and the trimmed, entity-decoded link
     * text.
     *
     * @param content The HTML to scan. Must not be null.
     * @param listener Receives one onAnchor call per anchor found. Must not be
     * null if the content contains at least one matching anchor.
     * @deprecated use HTParser
     * @author <a href="mailto:burton@peerfear.org">Kevin A. Burton</a>
     */
    public static void parseAnchors( String content, AnchorParserListener listener ) {

        Matcher m = pattern.matcher( content );

        while ( m.find() ) {

            //expand this link
            String resource = EntityDecoder.decode( m.group( 1 ) );

            //trim before decoding so the surrounding markup whitespace is
            //dropped, same order as the original implementation.
            String title = EntityDecoder.decode( m.group( 2 ).trim() );

            listener.onAnchor( resource, null, title );

        }

    }

}
1.1 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/AnchorParserListener.java
Index: AnchorParserListener.java
===================================================================
/*
* Copyright 1999,2004 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.feedparser.locate;
import java.io.*;
import java.net.*;
import java.util.*;
import java.util.regex.*;
/**
 *
 * Callback interface through which an anchor parser hands back what it finds
 * in a piece of HTML.
 *
 * AnchorParser.parseAnchors reports each anchor it matches through onAnchor.
 *
 * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
 */
public interface AnchorParserListener {

    /**
     * Hands an arbitrary object to the listener.
     *
     * NOTE(review): nothing in this file calls this method, so when it is
     * invoked and what the object means is up to the caller -- confirm.
     *
     * @param context Caller supplied object.
     */
    void setContext( Object context );

    /**
     * Called once for every anchor found.
     *
     * @param href The link target. AnchorParser passes it entity-decoded.
     * @param rel The rel value of the anchor. AnchorParser always passes null.
     * @param title The link text. AnchorParser passes it trimmed and
     * entity-decoded.
     */
    void onAnchor( String href, String rel, String title );

    /**
     * Called for an image.
     *
     * NOTE(review): no caller is visible here; presumably src, width and
     * height are the raw attribute values of an img tag -- confirm.
     *
     * @param src The image source.
     * @param width The image width, as a string.
     * @param height The image height, as a string.
     */
    void onImage( String src, String width, String height );

}
1.1 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/EntityDecoder.java
Index: EntityDecoder.java
===================================================================
/*
* Copyright 1999,2004 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.feedparser.locate;
import java.io.*;
import java.net.*;
import java.util.*;
import java.util.regex.*;
/**
 *
 * Given a piece of HTML we will decode the named entities it contains. This
 * is a trivial implementation: only the handful of entities registered below
 * are known, numeric character references are not handled, and entity names
 * are matched as lowercase ASCII letters only. Anything that is not
 * recognized is copied to the output unchanged.
 *
 * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
 * @version $Id: EntityDecoder.java,v 1.1 2004/04/15 00:58:44 burton Exp $
 */
public class EntityDecoder {

    /** Maps an entity name (no leading ampersand, no trailing semicolon) to its replacement text. */
    private static final HashMap entities = new HashMap();

    /** Matches one named entity; group 1 is the entity name. */
    static Pattern pattern = Pattern.compile( "&([a-z]+);" );

    static {

        //FIXME: there are a LOT more of these and we need an exhaustive collection.

        entities.put( "gt", ">" );
        entities.put( "lt", "<" );
        entities.put( "amp", "&" );

        //was mapped to ">" by mistake; apos is the apostrophe.
        entities.put( "apos", "'" );
        entities.put( "quot", "\"" );

        //these were mapped to the empty string, which silently dropped the
        //character. Use the real guillemets: U+00BB and U+00AB.
        entities.put( "raquo", "\u00bb" );
        entities.put( "laquo", "\u00ab" );

    }

    /**
     * Replace every known named entity in the given text with the character
     * it stands for.
     *
     * The text is scanned once from left to right, so the output of one
     * replacement is never decoded again (the input for an escaped "less
     * than" entity, i.e. amp followed by "lt;", yields the literal text of
     * the lt entity rather than a "less than" sign).
     *
     * @param content The text to decode. May be null.
     * @return The decoded text, or null if content was null.
     */
    public static String decode( String content ) {

        if ( content == null ) {
            return null;
        }

        //FIXME(performance): do I have existing code that does this more efficiently?

        StringBuffer buff = new StringBuffer( content.length() );

        Matcher m = pattern.matcher( content );

        //offset of the first character not yet copied to the output
        int index = 0;

        while ( m.find() ) {

            //copy the literal text between the previous entity and this one
            buff.append( content.substring( index, m.start() ) );

            Object replacement = entities.get( m.group( 1 ) );

            if ( replacement != null ) {
                buff.append( (String)replacement );
            } else {
                //found an entity we know nothing about; keep it as written.
                buff.append( m.group() );
            }

            index = m.end();

        }

        //copy whatever follows the last entity
        buff.append( content.substring( index ) );

        return buff.toString();

    }

    /**
     * Simple smoke test: prints the decoded form of a few strings that
     * contain the amp entity at the start, middle and end.
     */
    public static void main( String[] args ) throws Exception {

        System.out.println( decode( "&amp;" ) );
        System.out.println( decode( "asdf&amp;asdf" ) );
        System.out.println( decode( "asdf&amp;" ) );
        System.out.println( decode( "&amp;asdf" ) );

    }

}
---------------------------------------------------------------------
To unsubscribe, e-mail: commons-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: commons-dev-help@jakarta.apache.org