You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@commons.apache.org by bu...@apache.org on 2004/08/03 03:24:17 UTC
cvs commit: jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/tools XMLCleanser.java XMLEncodingParser.java
burton 2004/08/02 18:24:17
Modified: feedparser/src/java/org/apache/commons/feedparser
FeedParser.java
Added: feedparser/src/java/org/apache/commons/feedparser/tools
XMLCleanser.java XMLEncodingParser.java
Log:
Fixed BAD bug in the FeedParser with UTF-8 encoding of content due to internal bug in JDOM and the JDK...
Revision Changes Path
1.5 +48 -2 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/FeedParser.java
Index: FeedParser.java
===================================================================
RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/FeedParser.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -r1.4 -r1.5
--- FeedParser.java 15 Apr 2004 00:58:44 -0000 1.4
+++ FeedParser.java 3 Aug 2004 01:24:17 -0000 1.5
@@ -16,6 +16,8 @@
package org.apache.commons.feedparser;
+import org.apache.commons.feedparser.tools.*;
+
import java.io.*;
import java.net.*;
import java.util.*;
@@ -53,7 +55,26 @@
String resource ) throws FeedParserException {
try {
-
+
+ //Need to massage our XML support for UTF-8 to prevent
+
+ byte[] bytes = toByteArray( is );
+ String encoding = XMLEncodingParser.parse( bytes );
+
+ if ( encoding == null )
+ encoding = "UTF-8";
+
+ if ( encoding.equalsIgnoreCase( "UTF-8" ) ) {
+
+ String result = XMLCleanser.cleanse( bytes, encoding );
+ is = new ByteArrayInputStream( result.getBytes() );
+
+ } else {
+
+ //use the original bytes to build the DOM
+ is = new ByteArrayInputStream( bytes );
+ }
+
DOMBuilder builder = new DOMBuilder();
org.jdom.Document doc = builder.build( is );
@@ -105,6 +126,31 @@
//if an explicit FeedParserException is thrown just rethrow it..
throw fpe;
} catch ( Throwable t ) { throw new FeedParserException( t ); }
+
+ }
+
+ /**
+ * Convert an InputStream to a byte array.
+ */
+ public static byte[] toByteArray( InputStream is ) throws IOException {
+
+ //WARNING:
+ ByteArrayOutputStream bos = new ByteArrayOutputStream();
+
+ //now process the Reader...
+ byte data[] = new byte[200];
+
+ int readCount = 0;
+
+ while( ( readCount = is.read( data )) > 0 ) {
+
+ bos.write( data, 0, readCount );
+ }
+
+ is.close();
+ bos.close();
+
+ return bos.toByteArray();
}
1.1 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/tools/XMLCleanser.java
Index: XMLCleanser.java
===================================================================
/*
* Copyright 1999,2004 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.feedparser.tools;
/**
 * Class that can cleanse a string so that nothing can be present to break an
 * XML parser.  This is a VERY non-portable class as it is meant to work just
 * with Xalan/Xerces and may remove more text and replace things that are
 * non-XML centric.
 *
 * <p>Every method filters its input through {@link #isXMLCharacter(char)} and
 * returns only the characters (or bytes) that pass; nothing is substituted for
 * the removed ones.</p>
 *
 * @author <a href="mailto:burton@peerfear.org">Kevin A. Burton</a>
 * @version $Id: XMLCleanser.java,v 1.1 2004/08/03 01:24:17 burton Exp $
 */
public class XMLCleanser {

    /**
     * Return a copy of <code>content</code> with every character rejected by
     * {@link #isXMLCharacter(char)} removed.
     *
     * @param content the text to cleanse; must not be null.
     * @return the retained characters, in their original order.
     */
    public static String cleanse( String content ) {

        StringBuffer buff = new StringBuffer( content.length() );

        for ( int i = 0; i < content.length(); ++i ) {

            char c = content.charAt( i );

            if ( isXMLCharacter( c ) ) {
                buff.append( c );
            }

        }

        return buff.toString();

    }

    /**
     * Decode <code>content</code> using <code>encoding</code> and cleanse the
     * resulting text exactly as {@link #cleanse(String)} does.
     *
     * @param content the raw bytes to decode and cleanse.
     * @param encoding name of the charset used to decode <code>content</code>.
     * @return the retained characters, in their original order.
     * @throws Exception if <code>encoding</code> is not a supported charset
     * (java.io.UnsupportedEncodingException).
     *
     * @author <a href="mailto:burton@peerfear.org">Kevin A. Burton</a>
     */
    public static String cleanse( byte[] content, String encoding ) throws Exception {

        //decode once and share the filtering loop with the String variant.
        return cleanse( new String( content, encoding ) );

    }

    /**
     * Filter a byte array one byte at a time and return the retained values
     * as chars.  The returned array holds exactly the retained values; it is
     * not padded out to the length of the input.
     *
     * <p>NOTE(review): each byte is cast straight to <code>char</code>, which
     * sign-extends.  Bytes 0x80-0xFF are therefore tested (and returned) as
     * U+FF80-U+FFFF rather than U+0080-U+00FF.  Left as is because changing
     * it changes which bytes are retained; confirm the intended mapping.</p>
     *
     * @param content the raw bytes to cleanse.
     * @return a new array containing only the retained chars.
     */
    public static char[] cleanseToCharArray( byte[] content ) {

        char[] buff = new char[ content.length ];
        int index = 0;

        for ( int i = 0; i < content.length; ++i ) {

            char c = (char)content[ i ];

            if ( isXMLCharacter( c ) ) {
                buff[index] = c;
                ++index;
            }

        }

        //trim to the retained length, otherwise every removed byte shows up
        //as a trailing NUL char, which is itself not a legal XML character.
        char[] result = new char[ index ];
        System.arraycopy( buff, 0, result, 0, index );

        return result;

    }

    /**
     * Filter a byte array one byte at a time and return the retained bytes.
     * The returned array holds exactly the retained bytes; it is not padded
     * out to the length of the input.
     *
     * <p>NOTE(review): each byte is cast straight to <code>char</code> for the
     * test, which sign-extends.  As a result bytes 0x80-0xFD are seen as
     * U+FF80-U+FFFD and retained, while 0xFE and 0xFF are seen as
     * U+FFFE/U+FFFF and dropped.  Left as is because changing it changes which
     * bytes are retained; confirm the intended behaviour.</p>
     *
     * @param content the raw bytes to cleanse.
     * @return a new array containing only the retained bytes.
     *
     * @author <a href="mailto:burton@peerfear.org">Kevin A. Burton</a>
     */
    public static byte[] cleanseToByteArray( byte[] content ) {

        byte[] buff = new byte[ content.length ];
        int index = 0;

        for ( int i = 0; i < content.length; ++i ) {

            char c = (char)content[ i ];

            if ( isXMLCharacter( c ) ) {
                buff[index] = content[ i ];
                ++index;
            }

        }

        //trim to the retained length, otherwise every removed byte shows up
        //as a trailing NUL byte, which is itself not a legal XML character.
        byte[] result = new byte[ index ];
        System.arraycopy( buff, 0, result, 0, index );

        return result;

    }

    /**
     * This is a utility function for determining whether a specified
     * character should be kept.  It is modelled on production 2 of the
     * XML 1.0 specification but is deliberately NOT identical to it:
     *
     * <ul>
     * <li>tab, line feed and carriage return are accepted;</li>
     * <li>other characters below 0x20 are rejected;</li>
     * <li>0x20-0x7F are accepted;</li>
     * <li>0x80-0xFE are rejected, although production 2 allows them;</li>
     * <li>0xFF-0xD7FF and 0xE000-0xFFFD are accepted;</li>
     * <li>everything else (surrogates 0xD800-0xDFFF, 0xFFFE, 0xFFFF) is
     * rejected.  Because the test works on single <code>char</code> values,
     * characters outside the Basic Multilingual Plane are removed as
     * well.</li>
     * </ul>
     *
     * <p>NOTE(review): the 0x80-0xFE exclusion drops the whole Latin-1
     * supplement while keeping 0xFF; confirm this is intended before relying
     * on this method as a general XML character test.</p>
     *
     * @param c <code>char</code> to check for XML compliance.
     * @return <code>boolean</code> - true if it's a character,
     *                                false otherwise.
     */
    public static boolean isXMLCharacter(char c) {

        if (c == '\n') return true;
        if (c == '\r') return true;
        if (c == '\t') return true;

        if (c < 0x20) return false;
        if (c < 0x80) return true;

        if (c < 0xFF) return false;
        if (c <= 0xD7FF) return true;

        if (c < 0xE000) return false;

        //a char cannot exceed 0xFFFF, so only 0xFFFE and 0xFFFF remain to be
        //rejected here.
        return c <= 0xFFFD;

    }

}
1.1 jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/tools/XMLEncodingParser.java
Index: XMLEncodingParser.java
===================================================================
/*
* Copyright 1999,2004 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.feedparser.tools;
import java.io.*;
import java.net.*;
import java.util.*;
/**
 *
 * Given an XML document pull out the encoding or null if not specified.
 *
 * <p>Only the XML declaration (<code>&lt;?xml ... ?&gt;</code>) found within
 * the first 100 bytes of the document is examined.</p>
 *
 * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
 */
public class XMLEncodingParser {

    /**
     * Literal text of an encoding pseudo-attribute written with a double
     * quote and no whitespace.  Kept for backwards compatibility;
     * {@link #parse(byte[])} accepts other spellings as well.
     */
    public static final String ENCODING = "encoding=\"";

    /** Start of an XML declaration. */
    private static final String DECL_START = "<?xml";

    /** Name of the pseudo-attribute we are looking for. */
    private static final String ENCODING_NAME = "encoding";

    /** Number of leading bytes that are inspected. */
    private static final int SNIFF_LENGTH = 100;

    /**
     * Extract the value of the <code>encoding</code> pseudo-attribute from the
     * XML declaration at the start of <code>content</code>.
     *
     * <p>The value may be delimited by double or single quotes and whitespace
     * is allowed around the equals sign, as the XML grammar permits.</p>
     *
     * @param content the raw bytes of the document; must not be null.
     * @return the declared encoding exactly as written, or null when the
     * first 100 bytes contain no complete XML declaration or the declaration
     * has no well formed encoding pseudo-attribute.
     * @throws Exception declared for backwards compatibility with existing
     * callers.
     *
     * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
     */
    public static String parse( byte[] content ) throws Exception {

        //this isn't really pretty but it is fast.

        //just use the first 100 bytes.  Decode them as ISO-8859-1 so that the
        //result does not depend on the platform default charset: every byte
        //maps to exactly one char and the ASCII declaration is preserved.
        int length = Math.min( content.length, SNIFF_LENGTH );
        String str = new String( content, 0, length, "ISO-8859-1" );

        //an encoding attribute on an ordinary element is not the encoding of
        //the document, so insist on a real XML declaration.
        int start = str.indexOf( DECL_START );

        if ( start == -1 )
            return null;

        int afterName = start + DECL_START.length();

        //rule out other processing instructions such as <?xml-stylesheet
        if ( afterName >= str.length() || ! isWhitespace( str.charAt( afterName ) ) )
            return null;

        int end = str.indexOf( '>', afterName );

        if ( end == -1 )
            return null;

        String decl = str.substring( afterName, end );

        int index = decl.indexOf( ENCODING_NAME );

        if ( index == -1 )
            return null;

        //encoding S? '=' S? ( '"' value '"' | "'" value "'" )
        int pos = skipWhitespace( decl, index + ENCODING_NAME.length() );

        if ( pos >= decl.length() || decl.charAt( pos ) != '=' )
            return null;

        pos = skipWhitespace( decl, pos + 1 );

        if ( pos >= decl.length() )
            return null;

        char quote = decl.charAt( pos );

        if ( quote != '"' && quote != '\'' )
            return null;

        int close = decl.indexOf( quote, pos + 1 );

        if ( close == -1 )
            return null;

        return decl.substring( pos + 1, close );

    }

    /** True for the four characters XML treats as whitespace. */
    private static boolean isWhitespace( char c ) {
        return c == ' ' || c == '\t' || c == '\r' || c == '\n';
    }

    /** Index of the first non-whitespace char at or after <code>from</code>. */
    private static int skipWhitespace( String s, int from ) {

        int i = from;

        while ( i < s.length() && isWhitespace( s.charAt( i ) ) ) {
            ++i;
        }

        return i;

    }

}
---------------------------------------------------------------------
To unsubscribe, e-mail: commons-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: commons-dev-help@jakarta.apache.org