You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@commons.apache.org by bu...@apache.org on 2005/01/18 20:39:36 UTC
cvs commit: jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate AnchorParser.java DiscoveryLocator.java EntityDecoder.java FeedLocator.java ProbeLocator.java
burton 2005/01/18 11:39:36
Modified: feedparser build.xml
feedparser/src/java/org/apache/commons/feedparser/locate
AnchorParser.java DiscoveryLocator.java
EntityDecoder.java FeedLocator.java
ProbeLocator.java
Log:
Fixed a major bug in our anchor parser that would actually cause the page to stop being parsed.
Revision Changes Path
1.11 +4 -0 jakarta-commons-sandbox/feedparser/build.xml
Index: build.xml
===================================================================
RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/build.xml,v
retrieving revision 1.10
retrieving revision 1.11
diff -u -r1.10 -r1.11
--- build.xml 22 Oct 2004 00:37:08 -0000 1.10
+++ build.xml 18 Jan 2005 19:39:36 -0000 1.11
@@ -119,7 +119,9 @@
classname="org.apache.commons.feedparser.locate.TestFeedLocator"
fork="true"
failonerror="true">
+
<sysproperty key="feedparser.home" value="${feedparser.home}"/>
+
</java>
</target>
@@ -131,7 +133,9 @@
classname="org.apache.commons.feedparser.test.TestProbeLocator"
fork="true"
failonerror="true">
+
<sysproperty key="feedparser.home" value="${feedparser.home}"/>
+
</java>
</target>
1.6 +50 -8 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/AnchorParser.java
Index: AnchorParser.java
===================================================================
RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/AnchorParser.java,v
retrieving revision 1.5
retrieving revision 1.6
diff -u -r1.5 -r1.6
--- AnchorParser.java 29 Dec 2004 02:18:21 -0000 1.5
+++ AnchorParser.java 18 Jan 2005 19:39:36 -0000 1.6
@@ -22,7 +22,8 @@
/**
*
- * Given HTML pull out an array of anchors
+ * Given a string of HTML content, parse out anchors and fire events with all
+ * the data when they are found.
*
* @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
*/
@@ -51,26 +52,67 @@
int index = 0;
- //FIXME: what if href isn't the first attribute? It will fail here...
+ //FIXME: what if href isn't the first attribute? Will it fail here???
+
+ //FIXME: how do we pass back the content of the href: <a href=''> this
+ //is the content </a> which would pass a string "this is the content"
Matcher m = pattern.matcher( content );
- LinkedList list = new LinkedList();
-
while ( m.find() ) {
- //expand this link
+ HashMap map = DiscoveryLocator.getAttributes( m.group( 0 ) );
String resource = EntityDecoder.decode( m.group( 1 ) );
- String title = EntityDecoder.decode( m.group( 2 ).trim() );
+ //String title = EntityDecoder.decode( m.group( 2 ).trim() );
+ String title = (String)map.get( "title" );
+
+ if ( title != null ) {
+ title = title.trim();
+ title = EntityDecoder.decode( title );
+ }
+
+ String rel = (String)map.get( "rel" );
+
if ( resource == null || resource.equals( "" ) )
- return;
+ continue;
- if ( ! listener.onAnchor( resource, null, title ) )
+ if ( ! listener.onAnchor( resource, rel, title ) )
return;
}
+
+ }
+
+ public static void main( String[] args ) throws Exception {
+
+ AnchorParserListener listener = new AnchorParserListener() {
+
+ public boolean onAnchor( String href, String rel, String title ) {
+
+ System.out.println( "href: " + href );
+ System.out.println( "rel: " + rel );
+ System.out.println( "title: " + title );
+ return true;
+ }
+
+ public Object getResult() {
+ return null;
+ }
+ public void setContext( Object context ) {}
+
+ };
+
+ //FIXME: won't work with single quotes
+ //FIXME: won't work with <a />
+ //parse( "<a href=\"http://peerfear.org\" rel=\"linux\" title=\"linux\" >adf</a>", listener );
+
+ //parse( "<a rel=\"linux\" href=\"http://peerfear.org\" title=\"linux\" >adf</a>", listener );
+ //parse( "<a title=\"linux\" rel=\"linux\" href=\"http://peerfear.org\" >adf</a>", listener );
+
+ parse( "<a href='http://peerfear.org' rel='linux' title='linux' >adf</a>", listener );
+
}
1.14 +2 -2 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/DiscoveryLocator.java
Index: DiscoveryLocator.java
===================================================================
RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/DiscoveryLocator.java,v
retrieving revision 1.13
retrieving revision 1.14
diff -u -r1.13 -r1.14
--- DiscoveryLocator.java 20 Aug 2004 21:44:06 -0000 1.13
+++ DiscoveryLocator.java 18 Jan 2005 19:39:36 -0000 1.14
@@ -134,11 +134,11 @@
*
* @author <a href="mailto:burton@rojo.com">Kevin A. Burton</a>
*/
- public static HashMap getAttributes( String link ) {
+ public static HashMap getAttributes( String content ) {
HashMap map = new HashMap();
- Matcher m = attr_pattern.matcher( link );
+ Matcher m = attr_pattern.matcher( content );
int index = 0;
1.4 +5 -4 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/EntityDecoder.java
Index: EntityDecoder.java
===================================================================
RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/EntityDecoder.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- EntityDecoder.java 29 Dec 2004 02:18:21 -0000 1.3
+++ EntityDecoder.java 18 Jan 2005 19:39:36 -0000 1.4
@@ -23,9 +23,10 @@
/**
*
- * Given a piece of HTML we will decode the entities it contains. This is a
- * trivial implementation and we need to go through and make sure all HTML
- * entities are escaped correctly.
+ * Given a string of HTML content we decode the entities it contains.
+ *
+ * NOTE: Currently this is a trivial implementation and we need to go through
+ * and make sure all HTML entities are correctly supported.
*
* @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
* @version $Id$
1.25 +0 -2 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/FeedLocator.java
Index: FeedLocator.java
===================================================================
RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/FeedLocator.java,v
retrieving revision 1.24
retrieving revision 1.25
diff -u -r1.24 -r1.25
--- FeedLocator.java 22 Oct 2004 00:37:08 -0000 1.24
+++ FeedLocator.java 18 Jan 2005 19:39:36 -0000 1.25
@@ -104,13 +104,11 @@
//String resource = "file:///projects/feedparser/tests/locate5.html";
//String resource = "file:///projects/feedparser/tests/locate6.html";
-
//FIXME: add UNIT TESTS for Yahoo Groups and Flickr
String resource = "http://groups.yahoo.com/group/aggregators/";
//String resource = "http://flickr.com/photos/tags/cats";
-
//String resource = "file:///projects/feedparser/tests/locate8.html";
//String resource = "http://blogs.sun.com/roller/page/gonzo";
1.17 +2 -2 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/ProbeLocator.java
Index: ProbeLocator.java
===================================================================
RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/ProbeLocator.java,v
retrieving revision 1.16
retrieving revision 1.17
diff -u -r1.16 -r1.17
--- ProbeLocator.java 22 Oct 2004 00:37:08 -0000 1.16
+++ ProbeLocator.java 18 Jan 2005 19:39:36 -0000 1.17
@@ -79,9 +79,9 @@
// fail-fast if we already have some results and if we determine that
// we can trust the results (TextAmerica has invalid autodiscovery,
// for example)
- if ( list.size() > 0 && blogService.hasValidAutoDiscovery() )
+ if ( list.size() > 0 && blogService.hasValidAutodiscovery() )
return list;
- else if ( blogService.hasValidAutoDiscovery() == false ) {
+ else if ( blogService.hasValidAutodiscovery() == false ) {
// clear out the list so far since we can't trust the results
list.clear();
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commons-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: commons-dev-help@jakarta.apache.org