You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@commons.apache.org by bu...@apache.org on 2004/08/03 03:24:17 UTC
cvs commit: jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/tools XMLCleanser.java XMLEncodingParser.java

burton      2004/08/02 18:24:17

  Modified:    feedparser/src/java/org/apache/commons/feedparser
                        FeedParser.java
  Added:       feedparser/src/java/org/apache/commons/feedparser/tools
                        XMLCleanser.java XMLEncodingParser.java
  Log:
  Fixed BAD bug in the FeedParser with UTF-8 encoding of content due to an internal bug in JDOM and the JDK...
  
  Revision  Changes    Path
  1.5       +48 -2     jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/FeedParser.java
  
  Index: FeedParser.java
  ===================================================================
  RCS file: /home/cvs/jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/FeedParser.java,v
  retrieving revision 1.4
  retrieving revision 1.5
  diff -u -r1.4 -r1.5
  --- FeedParser.java	15 Apr 2004 00:58:44 -0000	1.4
  +++ FeedParser.java	3 Aug 2004 01:24:17 -0000	1.5
  @@ -16,6 +16,8 @@
   
   package org.apache.commons.feedparser;
   
  +import org.apache.commons.feedparser.tools.*;
  +
   import java.io.*;
   import java.net.*;
   import java.util.*;
  @@ -53,7 +55,26 @@
                                 String resource ) throws FeedParserException {
   
           try { 
  -            
  +
  +            //Need to massage our XML input for UTF-8 to prevent parser failures on invalid characters
  +
  +            byte[] bytes = toByteArray( is );
  +            String encoding = XMLEncodingParser.parse( bytes );
  +
  +            if ( encoding == null )
  +                encoding = "UTF-8";
  +
  +            if ( encoding.equalsIgnoreCase( "UTF-8" ) ) {
  +
  +                String result = XMLCleanser.cleanse( bytes, encoding );
  +                is = new ByteArrayInputStream( result.getBytes() );
  +                
  +            } else {
  +
  +                //use the original bytes to build the DOM
  +                is = new ByteArrayInputStream( bytes );
  +            }
  +
               DOMBuilder builder = new DOMBuilder();
               
               org.jdom.Document doc = builder.build( is );
  @@ -105,6 +126,31 @@
               //if an explicit FeedParserException is thrown just rethrow it..
               throw fpe;
           } catch ( Throwable t ) { throw new FeedParserException( t ); }
  +
  +    }
  +
  +    /**
  +     * Convert an InputStream to a byte array.
  +     */
  +    public static byte[] toByteArray( InputStream is ) throws IOException {
  +
  +        //WARNING: 
  +        ByteArrayOutputStream bos = new ByteArrayOutputStream();
  +      
  +        //now process the Reader...
  +        byte data[] = new byte[200];
  +    
  +        int readCount = 0;
  +
  +        while( ( readCount = is.read( data )) > 0 ) {
  +            
  +            bos.write( data, 0, readCount );
  +        }
  +
  +        is.close();
  +        bos.close();
  +
  +        return bos.toByteArray();
   
       }
   
  
  
  
  1.1                  jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/tools/XMLCleanser.java
  
  Index: XMLCleanser.java
  ===================================================================
  /*
   * Copyright 1999,2004 The Apache Software Foundation.
   * 
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
   * You may obtain a copy of the License at
   * 
   *      http://www.apache.org/licenses/LICENSE-2.0
   * 
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */
  
  package org.apache.commons.feedparser.tools;
  
  /**
   * Class that can cleanse a string so that nothing can be present to break an
   * XML parser.  This is a VERY non-portable class as it is meant to work just
   * with Xalan/Xerces and may remove more text and replace things that are
   * non-XML centric.  
   *
   * @author <a href="mailto:burton@peerfear.org">Kevin A. Burton</a>
   * @version $Id: XMLCleanser.java,v 1.1 2004/08/03 01:24:17 burton Exp $
   */
  public class XMLCleanser {

      /**
       * Return a copy of <code>content</code> containing only the characters
       * accepted by {@link #isXMLCharacter(char)}.  Rejected characters are
       * dropped, not replaced, so the result may be shorter than the input.
       *
       * @param content text to filter; must not be null.
       * @return the filtered text.
       */
      public static String cleanse( String content ) {

          StringBuffer buff = new StringBuffer( content.length() );

          for ( int i = 0; i < content.length(); ++i ) {

              char c = content.charAt( i );

              if ( isXMLCharacter( c ) ) {
                  buff.append( c );
              }

          }

          return buff.toString();

      }

      /**
       * Decode <code>content</code> using <code>encoding</code> and filter the
       * resulting text exactly as {@link #cleanse(String)} does.
       *
       * @param content raw bytes of the document; must not be null.
       * @param encoding name of the charset used to decode the bytes.
       * @return the decoded text with rejected characters removed.
       * @throws Exception if the named encoding is not supported.
       */
      public static String cleanse( byte[] content, String encoding ) throws Exception {

          //decode once and reuse the String based filter so that both entry
          //points always apply the same rules.
          return cleanse( new String( content, encoding ) );

      }

      /**
       * Filter a byte array, widening every byte to a char.
       *
       * Each byte is widened with a plain <code>(char)</code> cast, which
       * sign-extends: bytes 0x80-0xFF become U+FF80-U+FFFF rather than
       * U+0080-U+00FF.
       * NOTE(review): that is unlikely to be what callers want for non-ASCII
       * input; confirm the intended byte interpretation before relying on it.
       *
       * @param content raw bytes to filter; must not be null.
       * @return the accepted characters, in order.  The array length equals the
       *         number of accepted characters (no trailing padding).
       */
      public static char[] cleanseToCharArray( byte[] content ) {

          char[] buff = new char[ content.length ];

          int index = 0;

          for ( int i = 0; i < content.length; ++i ) {

              char c = (char)content[ i ];

              if ( isXMLCharacter( c ) ) {
                  buff[ index ] = c;
                  ++index;
              }

          }

          //nothing was removed so the scratch buffer is already the answer.
          if ( index == buff.length )
              return buff;

          //trim, otherwise every removed character would leave a trailing NUL
          //in the result (and NUL is itself not a legal XML character).
          char[] result = new char[ index ];
          System.arraycopy( buff, 0, result, 0, index );

          return result;

      }

      /**
       * Filter a byte array, keeping the original byte values.
       *
       * Every byte is widened with a plain <code>(char)</code> cast before it
       * is tested.  Because that cast sign-extends, bytes 0x80-0xFD are kept
       * and 0xFE/0xFF are dropped; bytes below 0x80 follow
       * {@link #isXMLCharacter(char)} directly.
       *
       * @param content raw bytes to filter; must not be null.
       * @return the accepted bytes, in order.  The array length equals the
       *         number of accepted bytes (no trailing padding).
       */
      public static byte[] cleanseToByteArray( byte[] content ) {

          byte[] buff = new byte[ content.length ];

          int index = 0;

          for ( int i = 0; i < content.length; ++i ) {

              char c = (char)content[ i ];

              if ( isXMLCharacter( c ) ) {
                  buff[ index ] = content[ i ];
                  ++index;
              }

          }

          //nothing was removed so the scratch buffer is already the answer.
          if ( index == buff.length )
              return buff;

          //trim, otherwise every removed byte would leave a trailing 0x00 in
          //the result.
          byte[] result = new byte[ index ];
          System.arraycopy( buff, 0, result, 0, index );

          return result;

      }

      /**
       * Decide whether this class lets a character through.
       *
       * The test is modelled on production 2 ("Char") of the XML 1.0
       * specification but is stricter: tab, line feed and carriage return are
       * accepted, other characters below 0x20 are rejected, 0x20-0x7F are
       * accepted, 0x80-0xFE are rejected, 0xFF-0xD7FF are accepted, the
       * surrogate range 0xD800-0xDFFF is rejected, 0xE000-0xFFFD are accepted
       * and 0xFFFE/0xFFFF are rejected.
       *
       * NOTE(review): rejecting 0x80-0xFE goes beyond the XML specification
       * and strips Latin-1 letters; it looks like a deliberate workaround, so
       * it is preserved here - confirm before relaxing it.
       *
       * @param c <code>char</code> to check.
       * @return <code>boolean</code> - true if the character is kept,
       *                                false otherwise.
       */
      public static boolean isXMLCharacter(char c) {

          if (c == '\n') return true;
          if (c == '\r') return true;
          if (c == '\t') return true;

          if (c < 0x20) return false;
          if (c < 0x80) return true;
          if (c < 0xFF) return false;
          if (c <= 0xD7FF) return true;
          if (c < 0xE000) return false;

          //a char cannot exceed 0xFFFF so only 0xFFFE and 0xFFFF remain to be
          //rejected at this point.
          return c <= 0xFFFD;
      }

  }
  
  
  
  1.1                  jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/tools/XMLEncodingParser.java
  
  Index: XMLEncodingParser.java
  ===================================================================
  /*
   * Copyright 1999,2004 The Apache Software Foundation.
   * 
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
   * You may obtain a copy of the License at
   * 
   *      http://www.apache.org/licenses/LICENSE-2.0
   * 
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */
  
  package org.apache.commons.feedparser.tools;
  
  import java.io.*;
  import java.net.*;
  import java.util.*;
  
  /**
   *
   * Given an XML document pull out the encoding or null if not specified.
   *
   * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
   */
  public class XMLEncodingParser {

      /**
       * Literal prefix of a double quoted encoding pseudo-attribute.  No longer
       * used by {@link #parse(byte[])}, which also accepts single quotes and
       * whitespace around the equals sign; kept because it is public.
       */
      public static final String ENCODING = "encoding=\"";

      /** Only this many leading bytes of the document are inspected. */
      private static final int MAX_SCAN_BYTES = 100;

      /** Opening of an XML declaration. */
      private static final String DECL_START = "<?xml";

      /** Name of the pseudo-attribute that carries the encoding. */
      private static final String ENCODING_NAME = "encoding";

      /**
       * Pull the declared encoding out of the XML declaration at the start of
       * a document.
       *
       * Only the first 100 bytes are examined.  They are decoded as
       * ISO-8859-1 so that the outcome does not depend on the platform default
       * charset and every byte maps to exactly one char.  The value may be
       * wrapped in single or double quotes and the equals sign may be
       * surrounded by whitespace.
       *
       * @param content raw bytes of the document, may be null.
       * @return the encoding name exactly as written in the declaration, or
       *         null when content is null, no <code>&lt;?xml</code> declaration
       *         is found in the inspected bytes, the declaration is not closed
       *         within them, or it carries no well formed encoding value.
       * @throws Exception declared for backward compatibility; decoding itself
       *         uses a charset every Java platform is required to support.
       */
      public static String parse( byte[] content ) throws Exception {

          if ( content == null )
              return null;

          int length = Math.min( content.length, MAX_SCAN_BYTES );

          String str = new String( content, 0, length, "ISO-8859-1" );

          //search instead of startsWith so that a leading byte order mark or
          //stray whitespace does not hide the declaration.
          int start = str.indexOf( DECL_START );

          if ( start == -1 )
              return null;

          int pos = start + DECL_START.length();

          //the target must be exactly "xml"; this rejects other processing
          //instructions such as <?xml-stylesheet ...?>
          if ( pos >= str.length() || ! isXMLWhitespace( str.charAt( pos ) ) )
              return null;

          int end = str.indexOf( '>', pos );

          if ( end == -1 )
              return null;

          String decl = str.substring( pos, end );

          int index = decl.indexOf( ENCODING_NAME );

          if ( index == -1 )
              return null;

          return readQuotedValue( decl, index + ENCODING_NAME.length() );

      }

      /**
       * Read <code>= "value"</code> or <code>= 'value'</code> starting at
       * <code>pos</code>, returning the value or null when the text there does
       * not have that shape.
       */
      private static String readQuotedValue( String decl, int pos ) {

          pos = skipWhitespace( decl, pos );

          if ( pos >= decl.length() || decl.charAt( pos ) != '=' )
              return null;

          pos = skipWhitespace( decl, pos + 1 );

          if ( pos >= decl.length() )
              return null;

          char quote = decl.charAt( pos );

          if ( quote != '"' && quote != '\'' )
              return null;

          //the value ends at the next occurrence of the quote that opened it.
          int close = decl.indexOf( quote, pos + 1 );

          if ( close == -1 )
              return null;

          return decl.substring( pos + 1, close );

      }

      /**
       * Return the index of the first non-whitespace character at or after
       * <code>pos</code>, or the length of the string if there is none.
       */
      private static int skipWhitespace( String s, int pos ) {

          while ( pos < s.length() && isXMLWhitespace( s.charAt( pos ) ) )
              ++pos;

          return pos;

      }

      /**
       * True for the four characters XML treats as white space.
       */
      private static boolean isXMLWhitespace( char c ) {

          return c == ' ' || c == '\t' || c == '\r' || c == '\n';

      }

  }
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: commons-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: commons-dev-help@jakarta.apache.org