You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@commons.apache.org by bu...@apache.org on 2004/09/03 21:46:47 UTC
cvs commit: jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/tools XMLCleanser.java XMLEncodingParser.java
burton 2004/09/03 12:46:47
Modified: feedparser TODO build.xml
feedparser/src/java/org/apache/commons/feedparser
FeedFilter.java FeedParser.java Main.java
RSSFeedParser.java
feedparser/src/java/org/apache/commons/feedparser/locate
AnchorParser.java ResourceExpander.java
feedparser/src/java/org/apache/commons/feedparser/test
TestFeedFilter.java TestFeedParser.java
feedparser/src/java/org/apache/commons/feedparser/tools
XMLCleanser.java XMLEncodingParser.java
Log:
don't use links if they are null
Revision Changes Path
1.12 +1 -2 jakarta-commons-sandbox/feedparser/TODO
Index: TODO
===================================================================
RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/TODO,v
retrieving revision 1.11
retrieving revision 1.12
diff -u -r1.11 -r1.12
--- TODO 31 Aug 2004 21:01:37 -0000 1.11
+++ TODO 3 Sep 2004 19:46:47 -0000 1.12
@@ -1,4 +1,5 @@
+- BUG: what happens when I put a comment after a UTF-16 BOM?!
- Support Base64 Atom values and the ability to enable them.
@@ -6,9 +7,7 @@
- Do we support multiple content items in Atom?
-
- We do not support multipart/alternative in the feedparser.
-
- Do we support atom:summary at ALL?! I don't think so...
1.7 +1 -0 jakarta-commons-sandbox/feedparser/build.xml
Index: build.xml
===================================================================
RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/build.xml,v
retrieving revision 1.6
retrieving revision 1.7
diff -u -r1.6 -r1.7
--- build.xml 2 Sep 2004 00:36:25 -0000 1.6
+++ build.xml 3 Sep 2004 19:46:47 -0000 1.7
@@ -121,6 +121,7 @@
<formatter type="plain" usefile="false"/>
+ <test name="org.apache.commons.feedparser.test.TestFeedFilter"/>
<test name="org.apache.commons.feedparser.test.TestProbeLocator"/>
<test name="org.apache.commons.feedparser.test.TestAtom"/>
<test name="org.apache.commons.feedparser.test.TestFeedParserUTF8"/>
1.4 +37 -12 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/FeedFilter.java
Index: FeedFilter.java
===================================================================
RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/FeedFilter.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- FeedFilter.java 2 Sep 2004 01:19:55 -0000 1.3
+++ FeedFilter.java 3 Sep 2004 19:46:47 -0000 1.4
@@ -32,40 +32,65 @@
private static Pattern entity_pattern = Pattern.compile( "&([a-zA-Z]+);" );
- public static byte[] parse( byte[] bytes ) {
+ public static byte[] parse( byte[] bytes )
+ throws Exception {
- String content = new String( bytes );
+ return parse( bytes, "UTF-8" );
+
+ }
+
+ public static byte[] parse( byte[] bytes, String encoding )
+ throws Exception {
+
+ String content = new String( bytes, encoding );
+
+ return parse( content, encoding );
+
+ }
+
+ public static byte[] parse( String content, String encoding )
+ throws Exception {
//remove leading prolog...
- content = doRemoveLeadingProlog( content );
+ content = doRemoveLeadingProlog( content, encoding );
content = doDecodeEntities( content );
- return content.getBytes();
-
- }
+ return content.getBytes( encoding );
+ }
+
/**
* Removing prolog whitespace, comments, and other garbage from the
* beginning of a feed.
*
* @author <a href="mailto:burton@rojo.com">Kevin A. Burton</a>
*/
- private static String doRemoveLeadingProlog( String content ) {
+ private static String doRemoveLeadingProlog( String content, String encoding ) {
+
+ //if we're a UTF-16 or UTF-32 feed we need to LEAVE the prolog because
+ //it triggers a UTF-16 parse.
+ if ( "UTF-16".equals( encoding ) ||
+ "UTF-32".equals( encoding ) )
+ return content;
+
//move to the beginning of the first element or comment. When this is a
//processing instruction we will move to that
int begin = content.indexOf( "<" );
- if ( begin > 0 )
+ if ( begin > 0 ) {
content = content.substring( begin, content.length() );
+ }
- //now skip to the XML processing instruction when necessary.
+ //now skip to the XML processing instruction when necessary. This is
+ //used to remove comments prior to <?xml which are not allowed.
begin = content.indexOf( "<?xml" );
- if ( begin > 0 )
+ if ( begin > 0 ) {
content = content.substring( begin, content.length() );
+ }
return content;
@@ -107,7 +132,7 @@
}
- public static void main( String[] args ) {
+ public static void main( String[] args ) throws Exception {
byte[] b = parse( "hello é world".getBytes() );
1.10 +47 -23 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/FeedParser.java
Index: FeedParser.java
===================================================================
RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/FeedParser.java,v
retrieving revision 1.9
retrieving revision 1.10
diff -u -r1.9 -r1.10
--- FeedParser.java 31 Aug 2004 21:00:32 -0000 1.9
+++ FeedParser.java 3 Sep 2004 19:46:47 -0000 1.10
@@ -32,6 +32,8 @@
import org.jaxen.jdom.*;
+import org.apache.log4j.Logger;
+
/**
* This FeedParser implementation is based on JDOM and Jaxen and is based around
* XPath and JDOM iteration. While the implementation is straight forward it
@@ -43,6 +45,8 @@
*/
public class FeedParser {
+ private static Logger log = Logger.getLogger( FeedParser.class );
+
/**
* Parse this feed.
*
@@ -56,6 +60,8 @@
try {
+ is = getCorrectInputStream( is );
+
// Need to massage our XML support for UTF-8 to prevent the
// dreaded "Invalid byte 1 of 1-byte UTF-8 sequence" content bug in
// some default feeds. This was tested a great deal under
@@ -64,31 +70,12 @@
// In FeedParser 2.0 (or as soon as we use SAX) this code should be
// totally removed to use the original stream.
- byte[] bytes = toByteArray( is );
- String encoding = XMLEncodingParser.parse( bytes );
-
- if ( encoding == null )
- encoding = "UTF-8";
-
- if ( encoding.equalsIgnoreCase( "UTF-8" ) ) {
-
- String result = XMLCleanser.cleanse( bytes, encoding );
- bytes = result.getBytes();
-
- }
-
- //remove prefix whitespace, intern HTML entities, etc.
- bytes = FeedFilter.parse( bytes );
-
- //build an input stream from the our bytes for parsing...
- is = new ByteArrayInputStream( bytes );
-
//OK. Now we have the right InputStream so we should build our DOM
//and exec.
DOMBuilder builder = new DOMBuilder();
-
+
org.jdom.Document doc = builder.build( is );
-
+
parse( listener, doc );
} catch ( FeedParserException fpe ) {
@@ -99,6 +86,43 @@
}
/**
+ * Perform the Xerces UTF8 correction and FeedFilter.
+ *
+ * @author <a href="mailto:burton@rojo.com">Kevin A. Burton</a>
+ */
+ private static InputStream getCorrectInputStream( InputStream is )
+ throws Exception {
+
+ byte[] bytes = toByteArray( is );
+
+ //FIXME: if we detect the WRONG encoding here we will badly corrupt the
+ //output of getBytes()... UTF-16 and UTF-32 especially
+ String encoding = XMLEncodingParser.parse( bytes );
+
+ if ( encoding == null )
+ encoding = "UTF-8";
+
+ if ( encoding.startsWith( "UTF" ) ) {
+
+ String result = XMLCleanser.cleanse( bytes, encoding );
+ bytes = FeedFilter.parse( result, encoding );
+
+ } else {
+
+ bytes = FeedFilter.parse( bytes, encoding );
+
+ }
+
+ //remove prefix whitespace, intern HTML entities, etc.
+
+ //build an input stream from our bytes for parsing...
+ is = new ByteArrayInputStream( bytes );
+
+ return is;
+
+ }
+
+ /**
* @deprecated Use #parse( FeedParserException, InputStream, String )
*/
public static void parse( FeedParserListener listener,
@@ -145,7 +169,7 @@
return;
}
- //fall back on RDF.
+ //fall back on RDF and RSS
RSSFeedParser.parse( listener, doc );
1.3 +3 -1 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/Main.java
Index: Main.java
===================================================================
RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/Main.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- Main.java 21 Apr 2004 07:16:03 -0000 1.2
+++ Main.java 3 Sep 2004 19:46:47 -0000 1.3
@@ -51,6 +51,8 @@
if ( input.startsWith( "http://" ) ) {
is = new URL( input ).openStream();
} else {
+
+ System.out.println( "Opening from file: " + input );
is = new FileInputStream( input );
}
1.12 +3 -1 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/RSSFeedParser.java
Index: RSSFeedParser.java
===================================================================
RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/RSSFeedParser.java,v
retrieving revision 1.11
retrieving revision 1.12
diff -u -r1.11 -r1.12
--- RSSFeedParser.java 2 Sep 2004 01:19:55 -0000 1.11
+++ RSSFeedParser.java 3 Sep 2004 19:46:47 -0000 1.12
@@ -245,6 +245,8 @@
public static String getChildElementTextByName( FeedParserState state,
String name ) throws Exception {
+ //FIXME: this can be rewritten to use getChild()
+
XPath xpath = new XPath( "descendant::*[local-name() = '" + name + "']" );
Object resultNode = xpath.selectSingleNode( state.current );
1.4 +4 -2 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/AnchorParser.java
Index: AnchorParser.java
===================================================================
RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/AnchorParser.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -r1.3 -r1.4
--- AnchorParser.java 31 Aug 2004 21:00:32 -0000 1.3
+++ AnchorParser.java 3 Sep 2004 19:46:47 -0000 1.4
@@ -40,8 +40,7 @@
parseAnchors( content, listener );
}
-
-
+
/**
* Get links from the given html with included titles and other metainfo.
*
@@ -66,6 +65,9 @@
String resource = EntityDecoder.decode( m.group( 1 ) );
String title = EntityDecoder.decode( m.group( 2 ).trim() );
+ if ( resource == null || resource.equals( "" ) )
+ return;
+
if ( ! listener.onAnchor( resource, null, title ) )
return;
1.5 +4 -1 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/ResourceExpander.java
Index: ResourceExpander.java
===================================================================
RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/locate/ResourceExpander.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -r1.4 -r1.5
--- ResourceExpander.java 26 Jun 2004 22:42:45 -0000 1.4
+++ ResourceExpander.java 3 Sep 2004 19:46:47 -0000 1.5
@@ -234,6 +234,9 @@
*/
public static String getBase( String resource ) {
+ if ( resource == null )
+ return null;
+
int begin = "http://".length() + 1;
int end = resource.lastIndexOf( "/" );
1.3 +27 -5 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/test/TestFeedFilter.java
Index: TestFeedFilter.java
===================================================================
RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/test/TestFeedFilter.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- TestFeedFilter.java 2 Sep 2004 00:36:25 -0000 1.2
+++ TestFeedFilter.java 3 Sep 2004 19:46:47 -0000 1.3
@@ -45,6 +45,8 @@
*/
public class TestFeedFilter extends TestCase {
+ public static int current = 0;
+
public TestFeedFilter( String name ) throws Exception {
super( name );
@@ -52,20 +54,31 @@
private void doTest( String resource ) throws Exception {
- System.out.println( "resource: " + resource );
+ System.out.println( "resource: (" + current + ") " + resource );
URL url = new URL( resource );
- PrintStream out = new PrintStream( new ByteArrayOutputStream() );
+ FileOutputStream fos = new FileOutputStream( "/tmp/test-feed-filter-" + current + ".html" );
+ PrintStream out = new PrintStream( fos, true, "UTF-8" );
+
+ out.println( "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html; charset=UTF-8\"> " );
+ out.println( "<pre>" );
+
DebugFeedParserListener listener = new DebugFeedParserListener( out );
FeedParser.parse( listener, url.openStream(), resource );
-
+
+ out.println( "</pre>" );
+
+ ++current;
+
}
public void test1() throws Exception {
+ doTest( "file:tests/feeds/rss-1.0-EUC-JP.rdf" );
+
doTest( "file:tests/filter/nbsp-1.xml" );
doTest( "file:tests/filter/entity-atom-1.xml" );
@@ -73,7 +86,16 @@
doTest( "file:tests/filter/prolog-atom-1.xml" );
doTest( "file:tests/filter/prolog-atom-2.xml" );
doTest( "file:tests/filter/prolog-opml-1.xml" );
-
+
+ doTest( "file:tests/feeds/utf16.rss1" );
+ doTest( "file:tests/feeds/utf16.rss2" );
+ doTest( "file:tests/feeds/i18n.atom" );
+ doTest( "file:tests/feeds/utf16.atom" );
+
+ doTest( "file:tests/feeds/atom-1.xml" );
+ doTest( "file:tests/feeds/rss-1.0-EUC-JP.rdf" );
+ doTest( "file:tests/feeds/rss-1.0-international-1.rdf" );
+
}
public static void main( String[] args ) throws Exception {
1.3 +7 -7 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/test/TestFeedParser.java
Index: TestFeedParser.java
===================================================================
RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/test/TestFeedParser.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- TestFeedParser.java 28 Feb 2004 03:35:22 -0000 1.2
+++ TestFeedParser.java 3 Sep 2004 19:46:47 -0000 1.3
@@ -72,13 +72,13 @@
public void finished() {}
};
-
- listener.setContext( this );
+
+ listener.setContext( this );
+
+ ResourceRequest request = ResourceRequestFactory.getResourceRequest( resource );
+
+ parser.parse( listener, request.getInputStream() );
- ResourceRequest request = ResourceRequestFactory.getResourceRequest( resource );
-
- parser.parse( listener, request.getInputStream() );
-
}
public static void main( String[] args ) {
1.2 +2 -2 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/tools/XMLCleanser.java
Index: XMLCleanser.java
===================================================================
RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/tools/XMLCleanser.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- XMLCleanser.java 3 Aug 2004 01:24:17 -0000 1.1
+++ XMLCleanser.java 3 Sep 2004 19:46:47 -0000 1.2
@@ -54,7 +54,7 @@
*/
public static String cleanse( byte[] content, String encoding ) throws Exception {
- String s = new String( content, encoding);
+ String s = new String( content, encoding );
StringBuffer buff = new StringBuffer( content.length );
1.2 +65 -5 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/tools/XMLEncodingParser.java
Index: XMLEncodingParser.java
===================================================================
RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/tools/XMLEncodingParser.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- XMLEncodingParser.java 3 Aug 2004 01:24:17 -0000 1.1
+++ XMLEncodingParser.java 3 Sep 2004 19:46:47 -0000 1.2
@@ -22,7 +22,8 @@
/**
*
- * Given an XML document pull out the encoding or null if not specified.
+ * Given an XML document pull out the encoding or the default (UTF-8) if not
+ * specified.
*
* @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
*/
@@ -41,17 +42,22 @@
//just use the first 100 bytes
String str;
-
+
if ( content.length > 100 ) {
str = new String( content, 0, 100 );
} else {
str = new String( content );
}
+ String result = getEncodingFromBOM( content );
+
+ if ( result != null )
+ return result;
+
int end = str.indexOf( ">" );
if ( end == -1 )
- return null;
+ return "UTF-8";
String decl = str.substring( 0, end );
@@ -65,16 +71,70 @@
end = encoding.indexOf( "\"" );
if ( end == -1 )
- return null;
+ return "UTF-8";
encoding = encoding.substring( 0, end);
+ encoding = encoding.toUpperCase();
+ if ( "UTF8".equals( encoding ) )
+ encoding = "UTF-8";
+
return encoding;
}
- return null;
+ return "UTF-8";
+
+ }
+
+ private static String getEncodingFromBOM( byte[] content ) {
+
+ // If a BOM is present we report the encoding family it implies (UTF-16 or
+ // UTF-32) without distinguishing byte order; NOTE(review): presumably callers
+ // only care that the content is some UTF encoding -- confirm.
+ //
+ // http://www.unicode.org/faq/utf_bom.html#BOM
+
+ if ( content.length > 2 ) {
+
+ //perform UTF-16 tests
+ if ( content[0] == -1 &&
+ content[1] == -2 )
+ return "UTF-16";
+
+ if ( content[0] == -2 &&
+ content[1] == -1 )
+ return "UTF-16";
+
+ }
+
+ if ( content.length > 4 ) {
+
+ //perform UTF-32 tests
+ if ( content[0] == 0 &&
+ content[1] == 0 &&
+ content[2] == -2 &&
+ content[3] == -1 )
+ return "UTF-32";
+
+ if ( content[0] == -1 &&
+ content[1] == -2 &&
+ content[2] == 0 &&
+ content[3] == 0 )
+ return "UTF-32";
+ }
+
+ return null;
+
}
+ public static void main( String[] args ) throws Exception {
+
+ System.out.println( parse( "<?xml encoding=\"utf-8\"?>".getBytes() ) );
+ System.out.println( parse( "<?xml encoding=\"UTF-8\"?>".getBytes() ) );
+ System.out.println( parse( "<?xml encoding=\"utf8\"?>".getBytes() ) );
+
+ }
+
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commons-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: commons-dev-help@jakarta.apache.org