You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@directory.apache.org by el...@apache.org on 2005/04/03 19:49:36 UTC
svn commit: r159940 - directory/sandbox/trunk/asn1-new-codec/src/java/org/apache/asn1/util/StringUtils.java

Author: elecharny
Date: Sun Apr  3 10:49:35 2005
New Revision: 159940

URL: http://svn.apache.org/viewcvs?view=rev&rev=159940
Log:
Added functions to deal with byte[] -> Unicode decoding. They are used by the MutableString and mainly in the decoder, to handle DN values.

Modified:
    directory/sandbox/trunk/asn1-new-codec/src/java/org/apache/asn1/util/StringUtils.java

Modified: directory/sandbox/trunk/asn1-new-codec/src/java/org/apache/asn1/util/StringUtils.java
URL: http://svn.apache.org/viewcvs/directory/sandbox/trunk/asn1-new-codec/src/java/org/apache/asn1/util/StringUtils.java?view=diff&r1=159939&r2=159940
==============================================================================
--- directory/sandbox/trunk/asn1-new-codec/src/java/org/apache/asn1/util/StringUtils.java (original)
+++ directory/sandbox/trunk/asn1-new-codec/src/java/org/apache/asn1/util/StringUtils.java Sun Apr  3 10:49:35 2005
@@ -29,6 +29,24 @@
     /** Hex chars */
     private static final byte[] HEX =
         new byte[] { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
+    
+    private static int UTF8_MULTI_BYTES_MASK = 0x0080;
+    
+    private static int UTF8_TWO_BYTES_MASK = 0x00E0;
+    private static int UTF8_TWO_BYTES = 0x00C0;
+    
+    private static int UTF8_THREE_BYTES_MASK = 0x00F0;
+    private static int UTF8_THREE_BYTES = 0x00E0;
+
+    private static int UTF8_FOUR_BYTES_MASK = 0x00F8;
+    private static int UTF8_FOUR_BYTES = 0x00F0;
+    
+    private static int UTF8_FIVE_BYTES_MASK = 0x00FC;
+    private static int UTF8_FIVE_BYTES = 0x00F8;
+
+    private static int UTF8_SIX_BYTES_MASK = 0x00FE;
+    private static int UTF8_SIX_BYTES = 0x00FC;
+    
 
     //~ Methods ------------------------------------------------------------------------------------
 
@@ -42,5 +60,195 @@
     {
         return new String(
                 new byte[] { '[', HEX[( octet & 0x00F0 ) >> 4], HEX[octet & 0x000F], ']' } );
+    }
+    
+    /**
+     * Decode the Unicode char whose encoding starts at the very first byte
+     * (index 0) of the given array.
+     *
+     * @param bytes The byte[] representation of a Unicode string.
+     * @return The char decoded at the beginning of the array.
+     */
+    public static char bytesToChar(byte[] bytes)
+    {
+        final int start = 0;
+
+        return bytesToChar(bytes, start);
+    }
+
+    /**
+     * Count the number of bytes used to encode the Unicode char which starts
+     * at the given position. This can be from 1 to 6.
+     *
+     * Only the byte at <code>pos</code> is examined : the following bytes
+     * are neither read nor validated.
+     *
+     * @param bytes The bytes to read
+     * @param pos Position to start counting. It must be a valid start of an
+     * encoded char !
+     * @return The number of bytes to create a char, or -1 if the encoding is wrong.
+     *
+     * TODO : Should stop after the third byte, as a char is only 2 bytes long.
+     */
+    public static int countBytesPerChar(byte[] bytes, int pos)
+    {
+        // The byte at 'pos' (and not the one at index 0) tells how long the
+        // char starting at 'pos' is.
+        int first = bytes[pos];
+
+        if ((first & UTF8_MULTI_BYTES_MASK) == 0)
+        {
+            return 1;
+        }
+        else if ((first & UTF8_TWO_BYTES_MASK) == UTF8_TWO_BYTES)
+        {
+            return 2;
+        }
+        else if ((first & UTF8_THREE_BYTES_MASK) == UTF8_THREE_BYTES)
+        {
+            return 3;
+        }
+        else if ((first & UTF8_FOUR_BYTES_MASK) == UTF8_FOUR_BYTES)
+        {
+            return 4;
+        }
+        else if ((first & UTF8_FIVE_BYTES_MASK) == UTF8_FIVE_BYTES)
+        {
+            return 5;
+        }
+        else if ((first & UTF8_SIX_BYTES_MASK) == UTF8_SIX_BYTES)
+        {
+            return 6;
+        }
+        else
+        {
+            // Either a continuation byte (10xx-xxxx) or 0xFE / 0xFF
+            return -1;
+        }
+    }
+    
+    /**
+     * Return the Unicode char which is coded in the bytes at the given position.
+     *
+     * The decoded value is narrowed to a char : for sequences of four bytes
+     * or more, only the 16 low-order bits of the decoded value are returned.
+     * The bytes following the first one are not checked to be of the form
+     * 10xx-xxxx, only their six low-order bits are used.
+     *
+     * @param bytes The byte[] representation of a Unicode string.
+     * @param pos The current position to start decoding the char
+     * @return The decoded char, or (char)-1 if the byte at <code>pos</code>
+     * is not a valid first byte
+     * @throws ArrayIndexOutOfBoundsException if the sequence announced by the
+     * first byte runs past the end of the array
+     *
+     * TODO : Should stop after the third byte, as a char is only 2 bytes long.
+     */
+    public static char bytesToChar(byte[] bytes, int pos)
+    {
+        if ((bytes[pos] & UTF8_MULTI_BYTES_MASK) == 0)
+        {
+            // One byte char : 0zzz-zzzz -> 0000-0000 0zzz-zzzz (007F)
+            return (char)bytes[pos];
+        }
+        else if ((bytes[pos] & UTF8_TWO_BYTES_MASK) == UTF8_TWO_BYTES)
+        {
+            // Two bytes char
+            // 110x-xxyy 10zz-zzzz -> 0000-0xxx yyzz-zzzz (07FF)
+            return (char)(
+                    ( ( bytes[pos] & 0x1F ) << 6 ) +
+                    ( bytes[pos + 1] & 0x3F )
+                    );
+        }
+        else if ((bytes[pos] & UTF8_THREE_BYTES_MASK) == UTF8_THREE_BYTES)
+        {
+            // Three bytes char
+            // 1110-tttt 10xx-xxyy 10zz-zzzz -> tttt-xxxx yyzz-zzzz (FFFF)
+            return (char)(
+                    ( ( bytes[pos] & 0x0F ) << 12 ) +
+                    ( ( bytes[pos + 1] & 0x3F ) << 6 ) +
+                    ( bytes[pos + 2] & 0x3F )
+                    );
+        }
+        else if ((bytes[pos] & UTF8_FOUR_BYTES_MASK) == UTF8_FOUR_BYTES)
+        {
+            // Four bytes char
+            // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz
+            //     -> 000t-ttuu vvvv-xxxx yyzz-zzzz (1FFFFF)
+            // The six payload bits of the second byte (uu-vvvv) are all
+            // shifted by 12.
+            return (char)(
+                    ( ( bytes[pos] & 0x07 ) << 18 ) +
+                    ( ( bytes[pos + 1] & 0x3F ) << 12 ) +
+                    ( ( bytes[pos + 2] & 0x3F ) << 6 ) +
+                    ( bytes[pos + 3] & 0x3F )
+                    );
+        }
+        else if ((bytes[pos] & UTF8_FIVE_BYTES_MASK) == UTF8_FIVE_BYTES)
+        {
+            // Five bytes char
+            // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz
+            //     -> 0000-00tt uuuu-uuvv wwww-xxxx yyzz-zzzz (03 FF FF FF)
+            return (char)(
+                    ( ( bytes[pos] & 0x03 ) << 24 ) +
+                    ( ( bytes[pos + 1] & 0x3F ) << 18 ) +
+                    ( ( bytes[pos + 2] & 0x3F ) << 12 ) +
+                    ( ( bytes[pos + 3] & 0x3F ) << 6 ) +
+                    ( bytes[pos + 4] & 0x3F )
+                    );
+        }
+        else if ((bytes[pos] & UTF8_SIX_BYTES_MASK) == UTF8_SIX_BYTES)
+        {
+            // Six bytes char. The test uses the SIX bytes mask : testing the
+            // five bytes mask a second time would make this branch unreachable.
+            // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz
+            //     -> 0stt-tttt uuuu-uuvv wwww-xxxx yyzz-zzzz (7F FF FF FF)
+            return (char)(
+                    ( ( bytes[pos] & 0x01 ) << 30 ) +
+                    ( ( bytes[pos + 1] & 0x3F ) << 24 ) +
+                    ( ( bytes[pos + 2] & 0x3F ) << 18 ) +
+                    ( ( bytes[pos + 3] & 0x3F ) << 12 ) +
+                    ( ( bytes[pos + 4] & 0x3F ) << 6 ) +
+                    ( bytes[pos + 5] & 0x3F )
+                    );
+        }
+        else
+        {
+            // Not a valid first byte : continuation byte (10xx-xxxx), 0xFE or 0xFF
+            return (char)-1;
+        }
+    }
+    
+    /**
+     * Count the number of chars included in the given byte[].
+     *
+     * @param bytes The byte array to decode
+     * @return The number of chars in the byte array, or -1 if the length of
+     * one of the chars cannot be determined because the encoding is wrong
+     */
+    public static int countChars(byte[] bytes)
+    {
+        int nbChars = 0;
+        int currentPos = 0;
+
+        while (currentPos < bytes.length)
+        {
+            int nbBytes = countBytesPerChar(bytes, currentPos);
+
+            if (nbBytes < 1)
+            {
+                // countBytesPerChar returns -1 on a wrong encoding : adding it
+                // to the position would move the cursor backward.
+                return -1;
+            }
+
+            currentPos += nbBytes;
+            nbChars ++;
+        }
+
+        return nbChars;
     }
 }