You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@directory.apache.org by el...@apache.org on 2005/04/03 19:49:36 UTC
svn commit: r159940 -
directory/sandbox/trunk/asn1-new-codec/src/java/org/apache/asn1/util/StringUtils.java
Author: elecharny
Date: Sun Apr 3 10:49:35 2005
New Revision: 159940
URL: http://svn.apache.org/viewcvs?view=rev&rev=159940
Log:
Added functions to deal with byte[] -> Unicode decoding. They are used by the MutableString and mainly in the decoder, to handle DN values.
Modified:
directory/sandbox/trunk/asn1-new-codec/src/java/org/apache/asn1/util/StringUtils.java
Modified: directory/sandbox/trunk/asn1-new-codec/src/java/org/apache/asn1/util/StringUtils.java
URL: http://svn.apache.org/viewcvs/directory/sandbox/trunk/asn1-new-codec/src/java/org/apache/asn1/util/StringUtils.java?view=diff&r1=159939&r2=159940
==============================================================================
--- directory/sandbox/trunk/asn1-new-codec/src/java/org/apache/asn1/util/StringUtils.java (original)
+++ directory/sandbox/trunk/asn1-new-codec/src/java/org/apache/asn1/util/StringUtils.java Sun Apr 3 10:49:35 2005
@@ -29,6 +29,24 @@
/** Hex chars */
private static final byte[] HEX =
new byte[] { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
+
+ private static int UTF8_MULTI_BYTES_MASK = 0x0080;
+
+ private static int UTF8_TWO_BYTES_MASK = 0x00E0;
+ private static int UTF8_TWO_BYTES = 0x00C0;
+
+ private static int UTF8_THREE_BYTES_MASK = 0x00F0;
+ private static int UTF8_THREE_BYTES = 0x00E0;
+
+ private static int UTF8_FOUR_BYTES_MASK = 0x00F8;
+ private static int UTF8_FOUR_BYTES = 0x00F0;
+
+ private static int UTF8_FIVE_BYTES_MASK = 0x00FC;
+ private static int UTF8_FIVE_BYTES = 0x00F8;
+
+ private static int UTF8_SIX_BYTES_MASK = 0x00FE;
+ private static int UTF8_SIX_BYTES = 0x00FC;
+
//~ Methods ------------------------------------------------------------------------------------
@@ -42,5 +60,195 @@
{
return new String(
new byte[] { '[', HEX[( octet & 0x00F0 ) >> 4], HEX[octet & 0x000F], ']' } );
+ }
+
+    /**
+     * Decode the Unicode char encoded at the very beginning of the given
+     * bytes. This is a shortcut for <code>bytesToChar(bytes, 0)</code>.
+     *
+     * @param bytes The byte[] representation of a Unicode string.
+     * @return The char decoded from position 0.
+     */
+    public static char bytesToChar(byte[] bytes)
+    {
+        final int start = 0;
+
+        return bytesToChar(bytes, start);
+    }
+
+    /**
+     * Count the number of bytes used to encode the Unicode char which starts
+     * at the given position. This can be from 1 to 6.
+     *
+     * Only the lead byte (the one at <code>pos</code>) is inspected : the
+     * following bytes are neither read nor validated.
+     *
+     * @param bytes The bytes to read
+     * @param pos Position to start counting. It must be a valid start of an
+     * encoded char !
+     * @return The number of bytes to create a char, or -1 if the encoding is wrong.
+     *
+     * TODO : Should stop after the third byte, as a char is only 2 bytes long.
+     */
+    public static int countBytesPerChar(byte[] bytes, int pos)
+    {
+        // The length is fully determined by the lead byte found at 'pos',
+        // not by the first byte of the array.
+        int lead = bytes[pos];
+
+        if ((lead & UTF8_MULTI_BYTES_MASK) == 0)
+        {
+            return 1;
+        }
+        else if ((lead & UTF8_TWO_BYTES_MASK) == UTF8_TWO_BYTES)
+        {
+            return 2;
+        }
+        else if ((lead & UTF8_THREE_BYTES_MASK) == UTF8_THREE_BYTES)
+        {
+            return 3;
+        }
+        else if ((lead & UTF8_FOUR_BYTES_MASK) == UTF8_FOUR_BYTES)
+        {
+            return 4;
+        }
+        else if ((lead & UTF8_FIVE_BYTES_MASK) == UTF8_FIVE_BYTES)
+        {
+            return 5;
+        }
+        else if ((lead & UTF8_SIX_BYTES_MASK) == UTF8_SIX_BYTES)
+        {
+            return 6;
+        }
+        else
+        {
+            // Either a continuation byte (10xx-xxxx) or 1111-111x : not a
+            // valid start of an encoded char.
+            return -1;
+        }
+    }
+
+ /**
+ * Return the Unicode char which is coded in the bytes at the given position.
+ * @param bytes The byte[] represntation of an Unicode string.
+ * @param pos The current position to start decoding the char
+ * @return The char found.
+ * @return The decoded char, or -1 if no char can be decoded
+ *
+ * TODO : Should stop after the third byte, as a char is only 2 bytes long.
+ */
+ public static char bytesToChar(byte[] bytes, int pos)
+ {
+ if ((bytes[pos] & UTF8_MULTI_BYTES_MASK) == 0)
+ {
+ return (char)bytes[pos];
+ }
+ else
+ {
+ if ((bytes[pos] & UTF8_TWO_BYTES_MASK) == UTF8_TWO_BYTES)
+ {
+ // Two bytes char
+ return (char)(
+ ( ( bytes[pos] & 0x1C ) << 6 ) + // 110x-xxyy 10zz-zzzz -> 0000-0xxx 0000-0000
+ ( ( bytes[pos] & 0x03 ) << 6 ) + // 110x-xxyy 10zz-zzzz -> 0000-0000 yy00-0000
+ ( bytes[pos + 1] & 0x3F ) // 110x-xxyy 10zz-zzzz -> 0000-0000 00zz-zzzz
+ ); // -> 0000-0xxx yyzz-zzzz (07FF)
+ }
+ else if ((bytes[pos] & UTF8_THREE_BYTES_MASK) == UTF8_THREE_BYTES)
+ {
+ // Three bytes char
+ return (char)(
+ // 1110-tttt 10xx-xxyy 10zz-zzzz -> tttt-0000-0000-0000
+ ( ( bytes[pos] & 0x0F) << 12 ) +
+ // 1110-tttt 10xx-xxyy 10zz-zzzz -> 0000-xxxx-0000-0000
+ ( ( bytes[pos + 1] & 0x3C) << 6 ) +
+ // 1110-tttt 10xx-xxyy 10zz-zzzz -> 0000-0000-yy00-0000
+ ( ( bytes[pos + 1] & 0x03) << 6 ) +
+ // 1110-tttt 10xx-xxyy 10zz-zzzz -> 0000-0000-00zz-zzzz
+ ( bytes[pos + 2] & 0x3F )
+ // -> tttt-xxxx yyzz-zzzz (FF FF)
+ );
+ }
+ else if ((bytes[pos] & UTF8_FOUR_BYTES_MASK) == UTF8_FOUR_BYTES)
+ {
+ // Four bytes char
+ return (char)(
+ // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 000t-tt00 0000-0000 0000-0000
+ ( ( bytes[pos] & 0x07) << 18 ) +
+ // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 0000-00uu 0000-0000 0000-0000
+ ( ( bytes[pos + 1] & 0x30) << 16 ) +
+ // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 0000-0000 vvvv-0000 0000-0000
+ ( ( bytes[pos + 1] & 0x0F) << 12 ) +
+ // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 0000-0000 0000-xxxx 0000-0000
+ ( ( bytes[pos + 2] & 0x3C) << 6 ) +
+ // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 0000-0000 0000-0000 yy00-0000
+ ( ( bytes[pos + 2] & 0x03) << 6 ) +
+ // 1111-0ttt 10uu-vvvv 10xx-xxyy 10zz-zzzz -> 0000-0000 0000-0000 00zz-zzzz
+ ( bytes[pos + 3] & 0x3F )
+ // -> 000t-ttuu vvvv-xxxx yyzz-zzzz (1FFFFF)
+ );
+ }
+ else if ((bytes[pos] & UTF8_FIVE_BYTES_MASK) == UTF8_FIVE_BYTES)
+ {
+ // Five bytes char
+ return (char)(
+ // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz -> 0000-00tt 0000-0000 0000-0000 0000-0000
+ ( ( bytes[pos] & 0x03) << 24 ) +
+ // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz -> 0000-0000 uuuu-uu00 0000-0000 0000-0000
+ ( ( bytes[pos + 1] & 0x3F) << 18 ) +
+ // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz -> 0000-0000 0000-00vv 0000-0000 0000-0000
+ ( ( bytes[pos + 2] & 0x30) << 12 ) +
+ // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz -> 0000-0000 0000-0000 wwww-0000 0000-0000
+ ( ( bytes[pos + 2] & 0x0F) << 12 ) +
+ // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz -> 0000-0000 0000-0000 0000-xxxx 0000-0000
+ ( ( bytes[pos + 3] & 0x3C) << 6 ) +
+ // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz -> 0000-0000 0000-0000 0000-0000 yy00-0000
+ ( ( bytes[pos + 3] & 0x03) << 6 ) +
+ // 1111-10tt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz -> 0000-0000 0000-0000 0000-0000 00zz-zzzz
+ ( bytes[pos + 4] & 0x3F )
+ // -> 0000-00tt uuuu-uuvv wwww-xxxx yyzz-zzzz (03 FF FF FF)
+ );
+ }
+ else if ((bytes[pos] & UTF8_FIVE_BYTES_MASK) == UTF8_FIVE_BYTES)
+ {
+ // Six bytes char
+ return (char)(
+ // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz ->
+ // 0s00-0000 0000-0000 0000-0000 0000-0000
+ ( ( bytes[pos] & 0x01) << 30 ) +
+ // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz ->
+ // 00tt-tttt 0000-0000 0000-0000 0000-0000
+ ( ( bytes[pos + 1] & 0x3F) << 24 ) +
+ // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz ->
+ // 0000-0000 uuuu-uu00 0000-0000 0000-0000
+ ( ( bytes[pos + 2] & 0x3F) << 18 ) +
+ // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz ->
+ // 0000-0000 0000-00vv 0000-0000 0000-0000
+ ( ( bytes[pos + 3] & 0x30) << 12 ) +
+ // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz ->
+ // 0000-0000 0000-0000 wwww-0000 0000-0000
+ ( ( bytes[pos + 3] & 0x0F) << 12 ) +
+ // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz ->
+ // 0000-0000 0000-0000 0000-xxxx 0000-0000
+ ( ( bytes[pos + 4] & 0x3C) << 6 ) +
+ // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz ->
+ // 0000-0000 0000-0000 0000-0000 yy00-0000
+ ( ( bytes[pos + 4] & 0x03) << 6 ) +
+ // 1111-110s 10tt-tttt 10uu-uuuu 10vv-wwww 10xx-xxyy 10zz-zzzz ->
+ // 0000-0000 0000-0000 0000-0000 00zz-zzzz
+ ( bytes[pos + 5] & 0x3F )
+ // -> 0stt-tttt uuuu-uuvv wwww-xxxx yyzz-zzzz (7F FF FF FF)
+ );
+ }
+ else
+ {
+ return (char)-1;
+ }
+ }
+ }
+
+    /**
+     * Count the number of chars included in the given byte[].
+     *
+     * The array is walked from one encoded char to the next, using the
+     * length given by countBytesPerChar for each of them.
+     *
+     * @param bytes The byte array to decode
+     * @return The number of char in the byte array, or -1 if a byte which
+     * is not a valid start of an encoded char is met
+     */
+    public static int countChars(byte[] bytes)
+    {
+        int nbChars = 0;
+        int currentPos = 0;
+
+        while (currentPos < bytes.length)
+        {
+            int nbBytes = countBytesPerChar(bytes, currentPos);
+
+            if (nbBytes < 0)
+            {
+                // Wrong encoding : adding -1 to the position would make the
+                // loop walk backward, so we stop here and report the error
+                // the same way countBytesPerChar does.
+                return -1;
+            }
+
+            currentPos += nbBytes;
+            nbChars++;
+        }
+
+        return nbChars;
     }
}