You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@xalan.apache.org by sb...@apache.org on 2001/11/04 06:22:27 UTC

cvs commit: xml-xalan/java/src/org/apache/xalan/serialize Encodings.java SerializerToHTML.java SerializerToText.java SerializerToXML.java

sboag       01/11/03 21:22:27

  Modified:    java/src/org/apache/xalan/serialize Encodings.java
                        SerializerToHTML.java SerializerToText.java
                        SerializerToXML.java
  Log:
  Progress on Bugzilla Bug 1639: Xalan escaping characters for ISO encodings other than ISO-8859-1.
  
  The problem is that you can't tell from the Java Writers if they can encode
  a character, and you can't tell which character they are going to encode to.
  So...
  
  Do a one-time-only reflection to see if a sun.io.CharToByteConverter for the
  specific encoding is available.  I'm hoping this will work for most or all
  platforms... but only some extensive testing will tell for sure.  If the
  CharToByteConverter is not available, it falls back to the old behavior.
  If it is available, use the canConvert method to see if the UTF-16 character
  can be encoded.  If it can be, just send it to the writer, otherwise escape it.
  This doesn't need to be done for characters < 128, so I suspect the
  performance hit won't be too bad.
  
  The alternative is to create lookups for all the encodings that tell which
  blocks of characters can't be encoded.  Too much work for me this
  weekend, though it remains a possibility.  I want to try the use of
  CharToByteConverter.canConvert first.
  
  One problem I've encountered is that CharToByteConverter.canConvert
  returns true for UTF-16 code points 127-159, so they go unescaped.
  How bad a problem is this?  This seems like a bug in the Java
  CharToByteConverter for ISO-8859-7.
  
  It remains to be seen if this problem exists with other encodings, or if this is
  the best approach.
  
  Revision  Changes    Path
  1.8       +52 -4     xml-xalan/java/src/org/apache/xalan/serialize/Encodings.java
  
  Index: Encodings.java
  ===================================================================
  RCS file: /home/cvs/xml-xalan/java/src/org/apache/xalan/serialize/Encodings.java,v
  retrieving revision 1.7
  retrieving revision 1.8
  diff -u -r1.7 -r1.8
  --- Encodings.java	2001/03/11 23:55:39	1.7
  +++ Encodings.java	2001/11/04 05:22:27	1.8
  @@ -67,7 +67,7 @@
    * to override encoding names and provide the last printable character
    * for each encoding.
    *
  - * @version $Revision: 1.7 $ $Date: 2001/03/11 23:55:39 $
  + * @version $Revision: 1.8 $ $Date: 2001/11/04 05:22:27 $
    * @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a>
    */
   public class Encodings extends Object
  @@ -121,6 +121,53 @@
         throw new UnsupportedEncodingException(encoding);
       }
     }
  +  
  +  /**
  +   * Returns an opaque CharToByte converter for the specified encoding.
  +   *
  +   * @param encoding The encoding
  +   * @return An object which should be a sun.io.CharToByteConverter, or null.
  +   */
  +  public static Object getCharToByteConverter(String encoding)
  +  {
  +
  +    Class charToByteConverterClass = null;
  +    java.lang.reflect.Method getConverterMethod = null;
  +    
  +    try
  +    {
  +      charToByteConverterClass = Class.forName("sun.io.CharToByteConverter");
  +      Class argTypes[] = new Class[1];
  +      argTypes[0] = String.class;
  +      getConverterMethod 
  +        = charToByteConverterClass.getMethod("getConverter", argTypes);
  +    }
  +    catch(Exception e)
  +    {
  +      System.err.println("Warning: Could not get charToByteConverterClass!");
  +      return null;
  +    }
  +    Object args[] = new Object[1];
  +    for (int i = 0; i < _encodings.length; ++i)
  +    {
  +      if (_encodings[i].name.equalsIgnoreCase(encoding))
  +      {
  +        try
  +        {
  +          args[0] = _encodings[i].javaName;
  +          Object converter = getConverterMethod.invoke(null, args);
  +          if(null != converter)
  +            return converter;
  +        }
  +        catch( Exception iae)
  +        {
  +          // keep trying
  +        }
  +      }
  +    }
  +
  +    return null;
  +  }
   
     /**
      * Returns the last printable character for the specified
  @@ -197,9 +244,10 @@
             */
             String jencoding =
               (encoding.equalsIgnoreCase("Cp1252") || encoding.equalsIgnoreCase(
  -            "ISO8859_1") || encoding.equalsIgnoreCase("8859_1") || encoding.equalsIgnoreCase("UTF8")) ? DEFAULT_MIME_ENCODING
  -                                                                                                      : convertJava2MimeEncoding(
  -                                                                                                        encoding);
  +            "ISO8859_1") || encoding.equalsIgnoreCase("8859_1") 
  +            || encoding.equalsIgnoreCase("UTF8")) ? DEFAULT_MIME_ENCODING
  +              : convertJava2MimeEncoding(
  +              encoding);
   
             encoding = (null != jencoding) ? jencoding : DEFAULT_MIME_ENCODING;
           }
  
  
  
  1.10      +6 -13     xml-xalan/java/src/org/apache/xalan/serialize/SerializerToHTML.java
  
  Index: SerializerToHTML.java
  ===================================================================
  RCS file: /home/cvs/xml-xalan/java/src/org/apache/xalan/serialize/SerializerToHTML.java,v
  retrieving revision 1.9
  retrieving revision 1.10
  diff -u -r1.9 -r1.10
  --- SerializerToHTML.java	2001/10/25 22:37:59	1.9
  +++ SerializerToHTML.java	2001/11/04 05:22:27	1.10
  @@ -981,7 +981,7 @@
             }
   
           }
  -        else if (ch < m_maxCharacter)
  +        else if (canConvert(ch))
           {
             accum(ch);
           }
  @@ -1060,7 +1060,7 @@
         // System.out.println("ch: "+(int)ch);
         // System.out.println("m_maxCharacter: "+(int)m_maxCharacter);
         // System.out.println("m_attrCharsMap[ch]: "+(int)m_attrCharsMap[ch]);
  -      if ((ch < m_maxCharacter) && (!m_charInfo.isSpecial(ch)))
  +      if (canConvert(ch) && (!m_charInfo.isSpecial(ch)))
         {
           accum(ch);
         }
  @@ -1113,22 +1113,15 @@
               accum(entityName);
               accum(';');
             }
  -          else if (ch < m_maxCharacter)
  +          else if (canConvert(ch))
             {
               accum(ch);  // no escaping in this case
             }
             else
             {
  -            if (ch < m_maxCharacter)
  -            {
  -              accum(ch);  // no escaping in this case
  -            }
  -            else
  -            {
  -              accum("&#");
  -              accum(Integer.toString(ch));
  -              accum(';');
  -            }
  +            accum("&#");
  +            accum(Integer.toString(ch));
  +            accum(';');
             }
           }
         }
  
  
  
  1.4       +0 -1      xml-xalan/java/src/org/apache/xalan/serialize/SerializerToText.java
  
  Index: SerializerToText.java
  ===================================================================
  RCS file: /home/cvs/xml-xalan/java/src/org/apache/xalan/serialize/SerializerToText.java,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- SerializerToText.java	2001/08/28 18:30:32	1.3
  +++ SerializerToText.java	2001/11/04 05:22:27	1.4
  @@ -385,7 +385,6 @@
             m_writer.write(c);
           }
   
  -        // This needs to go into a function... 
           else if (isUTF16Surrogate(c))
           {
             i = writeUTF16Surrogate(c, ch, i, end);
  
  
  
  1.11      +84 -8     xml-xalan/java/src/org/apache/xalan/serialize/SerializerToXML.java
  
  Index: SerializerToXML.java
  ===================================================================
  RCS file: /home/cvs/xml-xalan/java/src/org/apache/xalan/serialize/SerializerToXML.java,v
  retrieving revision 1.10
  retrieving revision 1.11
  diff -u -r1.10 -r1.11
  --- SerializerToXML.java	2001/10/10 18:42:24	1.10
  +++ SerializerToXML.java	2001/11/04 05:22:27	1.11
  @@ -304,7 +304,7 @@
     /**
      * Flag to quickly tell if the encoding is UTF8.
      */
  -  boolean m_isUTF8;
  +  boolean m_isUTF8 = false;
   
     /**
      * The maximum character size before we have to resort
  @@ -510,6 +510,7 @@
   
       if (m_encoding.equalsIgnoreCase("UTF-8"))
       {
  +      m_isUTF8 = true;
         if(output instanceof java.io.BufferedOutputStream)
         {
           init(new WriterToUTF8(output), format, true);
  @@ -1340,8 +1341,7 @@
         if (shouldIndent())
           indent(m_currentIndent);
   
  -      boolean writeCDataBrackets = (((length >= 1)
  -                                     && (ch[start] <= m_maxCharacter)));
  +      boolean writeCDataBrackets = (((length >= 1) && canConvert(ch[start])));
   
         if (writeCDataBrackets)
         {
  @@ -1564,7 +1564,7 @@
             checkWhite = false;
           }
     
  -        if (((ch < maxCharacter) && (!specialsMap.get(ch))) || ('"' == ch))
  +        if ((canConvert(ch) && (!specialsMap.get(ch))) || ('"' == ch))
           {
             lengthClean++;
           }
  @@ -1750,7 +1750,7 @@
         {
           m_writer.write(m_lineSep, 0, m_lineSepLen);
         }
  -      else if (isCData && (c > m_maxCharacter))
  +      else if (isCData && (!canConvert(c)))
         {
           if (i != 0)
             m_writer.write("]]>");
  @@ -1783,7 +1783,7 @@
         }
         else
         {
  -        if (c <= m_maxCharacter)
  +        if (canConvert(c))
           {
             m_writer.write(c);
           }
  @@ -2193,7 +2193,7 @@
           }
           else
           {
  -          if (ch > m_maxCharacter || (m_charInfo.isSpecial(ch)))
  +          if (!canConvert(ch) || (m_charInfo.isSpecial(ch)))
             {
               m_writer.write("&#");
               m_writer.write(Integer.toString(ch));
  @@ -2213,6 +2213,82 @@
   
       return pos;
     }
  +  
  +  /**
  +   * Opaque reference to the sun.io.CharToByteConverter for this 
  +   * encoding.
  +   */
  +  Object m_charToByteConverter = null;
  +  
  +  /**
  +   * Method reference to the sun.io.CharToByteConverter#canConvert method 
  +   * for this encoding.  Invalid if m_charToByteConverter is null.
  +   */
  +  java.lang.reflect.Method m_canConvertMeth;
  +  
  +  /**
  +   * Boolean that tells if we already tried to get the converter.
  +   */
  +  boolean m_triedToGetConverter = false;
  +  
  +  /**
  +   * Tell if this character can be written without escaping.
  +   */
  +  public boolean canConvert(char ch)
  +  {
  +    if(ch < 128)
  +    {
  +      if(ch >= 0x20 || (0x0A == ch || 0x0D == ch || 0x09 == ch) )
  +        return true;
  +      else
  +        return false;
  +    }
  +    
  +    if(null == m_charToByteConverter && false == m_triedToGetConverter)
  +    {
  +      m_triedToGetConverter = true;
  +      try
  +      {
  +        m_charToByteConverter = Encodings.getCharToByteConverter(m_encoding);
  +        if(null != m_charToByteConverter)
  +        {
  +          Class argsTypes[] = new Class[1];
  +          argsTypes[0] = Character.TYPE;
  +          Class convClass = m_charToByteConverter.getClass();
  +          m_canConvertMeth = convClass.getMethod("canConvert", argsTypes);
  +        }
  +      }
  +      catch(Exception e)
  +      {
  +       // This is just an assert: no action at the moment.
  +        System.err.println("Warning: "+e.getMessage());
  +      }
  +    }
  +    if(null != m_charToByteConverter)
  +    {
  +      try
  +      {
  +        Object args[] = new Object[1];
  +        args[0] = new Character( ch );
  +        Boolean bool 
  +          = (Boolean)m_canConvertMeth.invoke(m_charToByteConverter, args);
  +        return bool.booleanValue();
  +      }
  +      catch(java.lang.reflect.InvocationTargetException ite)
  +      {
  +        // This is just an assert: no action at the moment.
  +        System.err.println("Warning: InvocationTargetException in canConvert!");
  +      }
  +      catch(java.lang.IllegalAccessException iae)
  +      {
  +        // This is just an assert: no action at the moment.
  +        System.err.println("Warning: IllegalAccessException in canConvert!");
  +      }
  +    }
  +    // fallback!
  +    return ( ch <= m_maxCharacter );
  +  }
  +
   
     /**
      * Returns the specified <var>string</var> after substituting <VAR>specials</VAR>,
  @@ -2237,7 +2313,7 @@
         {
           char ch = stringChars[i];
     
  -        if ((ch < m_maxCharacter) && (!m_charInfo.isSpecial(ch)))
  +        if (canConvert(ch) && (!m_charInfo.isSpecial(ch)))
           {
             writer.write(ch);
           }
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: xalan-cvs-unsubscribe@xml.apache.org
For additional commands, e-mail: xalan-cvs-help@xml.apache.org