You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@commons.apache.org by se...@apache.org on 2010/09/10 18:33:42 UTC
svn commit: r995859 [30/30] - in
/commons/proper/sanselan/trunk/src/main/java/org/apache/sanselan: ./ color/
common/ common/byteSources/ common/mylzw/ formats/bmp/
formats/bmp/pixelparsers/ formats/bmp/writers/ formats/gif/ formats/ico/
formats/jpeg/ f...
Modified: commons/proper/sanselan/trunk/src/main/java/org/apache/sanselan/util/UnicodeUtils.java
URL: http://svn.apache.org/viewvc/commons/proper/sanselan/trunk/src/main/java/org/apache/sanselan/util/UnicodeUtils.java?rev=995859&r1=995858&r2=995859&view=diff
==============================================================================
--- commons/proper/sanselan/trunk/src/main/java/org/apache/sanselan/util/UnicodeUtils.java (original)
+++ commons/proper/sanselan/trunk/src/main/java/org/apache/sanselan/util/UnicodeUtils.java Fri Sep 10 16:33:35 2010
@@ -23,442 +23,442 @@ import org.apache.sanselan.common.Binary
public abstract class UnicodeUtils implements BinaryConstants
{
- /**
- * This class should never be instantiated.
- */
- private UnicodeUtils()
- {
- }
-
- public static class UnicodeException extends Exception
- {
- public UnicodeException(String message)
- {
- super(message);
- }
- }
-
- // A default single-byte charset.
- public static final int CHAR_ENCODING_CODE_ISO_8859_1 = 0;
- public static final int CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM = 1;
- public static final int CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM = 2;
- public static final int CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM = 3;
- public static final int CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM = 4;
- public static final int CHAR_ENCODING_CODE_UTF_8 = 5;
- public static final int CHAR_ENCODING_CODE_AMBIGUOUS = -1;
-
- // /*
- // * Guess the character encoding of arbitrary character data in a data
- // * buffer.
- // *
- // * The data may not run to the end of the buffer; it may be terminated.
- // This
- // * makes the problem much harder, since the character data may be followed
- // * by arbitrary data.
- // */
- // public static int guessCharacterEncoding(byte bytes[], int index)
- // {
- // int length = bytes.length - index;
- //
- // if (length < 1)
- // return CHAR_ENCODING_CODE_AMBIGUOUS;
- //
- // if (length >= 2)
- // {
- // // look for BOM.
- //
- // int c1 = 0xff & bytes[index];
- // int c2 = 0xff & bytes[index + 1];
- // if (c1 == 0xFF && c2 == 0xFE)
- // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM;
- // else if (c1 == 0xFE && c2 == 0xFF)
- // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM;
- // }
- //
- // }
- //
- // /*
- // * Guess the character encoding of arbitrary character data in a data
- // * buffer.
- // *
- // * The data fills the entire buffer. If it is terminated, the terminator
- // * byte(s) will be the last bytes in the buffer.
- // *
- // * This makes the problem a bit easier.
- // */
- // public static int guessCharacterEncodingSimple(byte bytes[], int index)
- // throws UnicodeException
- // {
- // int length = bytes.length - index;
- //
- // if (length < 1)
- // return CHAR_ENCODING_CODE_AMBIGUOUS;
- //
- // if (length >= 2)
- // {
- // // identify or eliminate UTF-16 with a BOM.
- //
- // int c1 = 0xff & bytes[index];
- // int c2 = 0xff & bytes[index + 1];
- // if (c1 == 0xFF && c2 == 0xFE)
- // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM;
- // else if (c1 == 0xFE && c2 == 0xFF)
- // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM;
- // }
- //
- // if (length >= 2)
- // {
- // // look for optional double-byte terminator.
- //
- // int c1 = 0xff & bytes[bytes.length - 2];
- // int c2 = 0xff & bytes[bytes.length - 1];
- // if (c1 == 0 && c2 == 0)
- // {
- // // definitely a flavor of UTF-16.
- // if (length % 2 != 0)
- // throw new UnicodeException(
- // "Character data with double-byte terminator has an odd length.");
- //
- // boolean mayHaveTerminator = true;
- // boolean mustHaveTerminator = false;
- // boolean possibleBigEndian = new UnicodeMetricsUTF16NoBOM(
- // BYTE_ORDER_BIG_ENDIAN).isValid(bytes, index,
- // mayHaveTerminator, mustHaveTerminator);
- // boolean possibleLittleEndian = new UnicodeMetricsUTF16NoBOM(
- // BYTE_ORDER_LITTLE_ENDIAN).isValid(bytes, index,
- // mayHaveTerminator, mustHaveTerminator);
- // if ((!possibleBigEndian) && (!possibleLittleEndian))
- // throw new UnicodeException(
- // "Invalid character data, possibly UTF-16.");
- // if (possibleBigEndian && possibleLittleEndian)
- // return CHAR_ENCODING_CODE_AMBIGUOUS;
- // if (possibleBigEndian)
- // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM;
- // if (possibleLittleEndian)
- // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM;
- // }
- // }
- //
- // List possibleEncodings = new ArrayList();
- // if (length % 2 == 0)
- // {
- // boolean mayHaveTerminator = true;
- // boolean mustHaveTerminator = false;
- // boolean possibleBigEndian = new UnicodeMetricsUTF16NoBOM(
- // BYTE_ORDER_BIG_ENDIAN).isValid(bytes, index,
- // mayHaveTerminator, mustHaveTerminator);
- // boolean possibleLittleEndian = new UnicodeMetricsUTF16NoBOM(
- // BYTE_ORDER_LITTLE_ENDIAN).isValid(bytes, index,
- // mayHaveTerminator, mustHaveTerminator);
- //
- // if (possibleBigEndian)
- // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM;
- // if (possibleLittleEndian)
- // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM;
- // }
- //
- // }
-
- public static final boolean isValidISO_8859_1(String s)
- {
- try
- {
- String roundtrip = new String(s.getBytes("ISO-8859-1"),
- "ISO-8859-1");
- return s.equals(roundtrip);
- } catch (UnsupportedEncodingException e)
- {
- // should never be thrown.
- throw new RuntimeException("Error parsing string.", e);
- }
- }
-
- /*
- * Return the index of the first utf-16 terminator (ie. two even-aligned
- * nulls). If not found, return -1.
- */
- private static int findFirstDoubleByteTerminator(byte bytes[], int index)
- {
- for (int i = index; i < bytes.length - 1; i += 2)
- {
- int c1 = 0xff & bytes[index];
- int c2 = 0xff & bytes[index + 1];
- if (c1 == 0 && c2 == 0)
- return i;
- }
- return -1;
- }
-
- public final int findEndWithTerminator(byte bytes[], int index)
- throws UnicodeException
- {
- return findEnd(bytes, index, true);
- }
-
- public final int findEndWithoutTerminator(byte bytes[], int index)
- throws UnicodeException
- {
- return findEnd(bytes, index, false);
- }
-
- protected abstract int findEnd(byte bytes[], int index,
- boolean includeTerminator) throws UnicodeException;
-
- public static UnicodeUtils getInstance(int charEncodingCode)
- throws UnicodeException
- {
- switch (charEncodingCode)
- {
- case CHAR_ENCODING_CODE_ISO_8859_1:
- return new UnicodeMetricsASCII();
- case CHAR_ENCODING_CODE_UTF_8:
- // Debug.debug("CHAR_ENCODING_CODE_UTF_8");
- return new UnicodeMetricsUTF8();
- case CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM:
- case CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM:
- // Debug.debug("CHAR_ENCODING_CODE_UTF_16_WITH_BOM");
- return new UnicodeMetricsUTF16WithBOM();
- case CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM:
- return new UnicodeMetricsUTF16NoBOM(BYTE_ORDER_BIG_ENDIAN);
- case CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM:
- return new UnicodeMetricsUTF16NoBOM(BYTE_ORDER_LITTLE_ENDIAN);
- default:
- throw new UnicodeException("Unknown char encoding code: "
- + charEncodingCode);
- }
- }
-
- private static class UnicodeMetricsASCII extends UnicodeUtils
- {
- public int findEnd(byte bytes[], int index, boolean includeTerminator)
- throws UnicodeException
- {
- for (int i = index; i < bytes.length; i++)
- {
- if (bytes[i] == 0)
- return includeTerminator ? i + 1 : i;
- }
- return bytes.length;
- // throw new UnicodeException("Terminator not found.");
- }
- }
-
- // private static class UnicodeMetricsISO_8859_1 extends UnicodeUtils
- // {
- // public int findEnd(byte bytes[], int index, boolean includeTerminator)
- // throws UnicodeException
- // {
- // for (int i = index; i < bytes.length; i++)
- // {
- // if (bytes[i] == 0)
- // return includeTerminator ? i + 1 : i;
- // }
- // return bytes.length;
- // // throw new UnicodeException("Terminator not found.");
- // }
- // }
-
- private static class UnicodeMetricsUTF8 extends UnicodeUtils
- {
-
- public int findEnd(byte bytes[], int index, boolean includeTerminator)
- throws UnicodeException
- {
- // http://en.wikipedia.org/wiki/UTF-8
-
- while (true)
- {
- if (index == bytes.length)
- return bytes.length;
- if (index > bytes.length)
- throw new UnicodeException("Terminator not found.");
-
- int c1 = 0xff & bytes[index++];
- if (c1 == 0)
- return includeTerminator ? index : index - 1;
- else if (c1 <= 0x7f)
- continue;
- else if (c1 <= 0xDF)
- {
- if (index >= bytes.length)
- throw new UnicodeException("Invalid unicode.");
-
- int c2 = 0xff & bytes[index++];
- if (c2 < 0x80 || c2 > 0xBF)
- throw new UnicodeException("Invalid code point.");
- } else if (c1 <= 0xEF)
- {
- if (index >= bytes.length - 1)
- throw new UnicodeException("Invalid unicode.");
-
- int c2 = 0xff & bytes[index++];
- if (c2 < 0x80 || c2 > 0xBF)
- throw new UnicodeException("Invalid code point.");
- int c3 = 0xff & bytes[index++];
- if (c3 < 0x80 || c3 > 0xBF)
- throw new UnicodeException("Invalid code point.");
- } else if (c1 <= 0xF4)
- {
- if (index >= bytes.length - 2)
- throw new UnicodeException("Invalid unicode.");
-
- int c2 = 0xff & bytes[index++];
- if (c2 < 0x80 || c2 > 0xBF)
- throw new UnicodeException("Invalid code point.");
- int c3 = 0xff & bytes[index++];
- if (c3 < 0x80 || c3 > 0xBF)
- throw new UnicodeException("Invalid code point.");
- int c4 = 0xff & bytes[index++];
- if (c4 < 0x80 || c4 > 0xBF)
- throw new UnicodeException("Invalid code point.");
- } else
- throw new UnicodeException("Invalid code point.");
- }
- }
- }
-
- private abstract static class UnicodeMetricsUTF16 extends UnicodeUtils
- {
- protected static final int BYTE_ORDER_BIG_ENDIAN = 0;
- protected static final int BYTE_ORDER_LITTLE_ENDIAN = 1;
- protected int byteOrder = BYTE_ORDER_BIG_ENDIAN;
-
- public UnicodeMetricsUTF16(int byteOrder)
- {
- this.byteOrder = byteOrder;
- }
-
- public boolean isValid(byte bytes[], int index,
- boolean mayHaveTerminator, boolean mustHaveTerminator)
- throws UnicodeException
- {
- // http://en.wikipedia.org/wiki/UTF-16/UCS-2
-
- while (true)
- {
- if (index == bytes.length)
- {
- // end of buffer, no terminator found.
- return !mustHaveTerminator;
- }
-
- if (index >= bytes.length - 1)
- {
- // end of odd-length buffer, no terminator found.
- return false;
- }
-
- int c1 = 0xff & bytes[index++];
- int c2 = 0xff & bytes[index++];
- int msb1 = byteOrder == BYTE_ORDER_BIG_ENDIAN ? c1 : c2;
-
- if (c1 == 0 && c2 == 0)
- {
- // terminator found.
- return mayHaveTerminator;
- }
-
- if (msb1 >= 0xD8)
- {
- // Surrogate pair found.
-
- if (msb1 >= 0xDC)
- {
- // invalid first surrogate.
- return false;
- }
-
- if (index >= bytes.length - 1)
- {
- // missing second surrogate.
- return false;
- }
-
- // second word.
- int c3 = 0xff & bytes[index++];
- int c4 = 0xff & bytes[index++];
- int msb2 = byteOrder == BYTE_ORDER_BIG_ENDIAN ? c3 : c4;
- if (msb2 < 0xDC)
- {
- // invalid second surrogate.
- return false;
- }
- }
- }
- }
-
- public int findEnd(byte bytes[], int index, boolean includeTerminator)
- throws UnicodeException
- {
- // http://en.wikipedia.org/wiki/UTF-16/UCS-2
-
- while (true)
- {
- if (index == bytes.length)
- return bytes.length;
- if (index > bytes.length - 1)
- throw new UnicodeException("Terminator not found.");
-
- int c1 = 0xff & bytes[index++];
- int c2 = 0xff & bytes[index++];
- int msb1 = byteOrder == BYTE_ORDER_BIG_ENDIAN ? c1 : c2;
-
- if (c1 == 0 && c2 == 0)
- {
- return includeTerminator ? index : index - 2;
- } else if (msb1 >= 0xD8)
- {
- if (index > bytes.length - 1)
- throw new UnicodeException("Terminator not found.");
-
- // second word.
- int c3 = 0xff & bytes[index++];
- int c4 = 0xff & bytes[index++];
- int msb2 = byteOrder == BYTE_ORDER_BIG_ENDIAN ? c3 : c4;
- if (msb2 < 0xDC)
- throw new UnicodeException("Invalid code point.");
- }
- }
- }
- }
-
- private static class UnicodeMetricsUTF16NoBOM extends UnicodeMetricsUTF16
- {
-
- public UnicodeMetricsUTF16NoBOM(final int byteOrder)
- {
- super(byteOrder);
- }
-
- }
-
- private static class UnicodeMetricsUTF16WithBOM extends UnicodeMetricsUTF16
- {
-
- public UnicodeMetricsUTF16WithBOM()
- {
- super(BYTE_ORDER_BIG_ENDIAN);
- }
-
- public int findEnd(byte bytes[], int index, boolean includeTerminator)
- throws UnicodeException
- {
- // http://en.wikipedia.org/wiki/UTF-16/UCS-2
-
- if (index >= bytes.length - 1)
- throw new UnicodeException("Missing BOM.");
-
- int c1 = 0xff & bytes[index++];
- int c2 = 0xff & bytes[index++];
- if (c1 == 0xFF && c2 == 0xFE)
- byteOrder = BYTE_ORDER_LITTLE_ENDIAN;
- else if (c1 == 0xFE && c2 == 0xFF)
- byteOrder = BYTE_ORDER_BIG_ENDIAN;
- else
- throw new UnicodeException("Invalid byte order mark.");
-
- return super.findEnd(bytes, index, includeTerminator);
- }
- }
+ /**
+ * This class should never be instantiated.
+ */
+ private UnicodeUtils()
+ {
+ }
+
+ public static class UnicodeException extends Exception
+ {
+ public UnicodeException(String message)
+ {
+ super(message);
+ }
+ }
+
+ // A default single-byte charset.
+ public static final int CHAR_ENCODING_CODE_ISO_8859_1 = 0;
+ public static final int CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM = 1;
+ public static final int CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM = 2;
+ public static final int CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM = 3;
+ public static final int CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM = 4;
+ public static final int CHAR_ENCODING_CODE_UTF_8 = 5;
+ public static final int CHAR_ENCODING_CODE_AMBIGUOUS = -1;
+
+ // /*
+ // * Guess the character encoding of arbitrary character data in a data
+ // * buffer.
+ // *
+ // * The data may not run to the end of the buffer; it may be terminated.
+ // This
+ // * makes the problem much harder, since the character data may be followed
+ // * by arbitrary data.
+ // */
+ // public static int guessCharacterEncoding(byte bytes[], int index)
+ // {
+ // int length = bytes.length - index;
+ //
+ // if (length < 1)
+ // return CHAR_ENCODING_CODE_AMBIGUOUS;
+ //
+ // if (length >= 2)
+ // {
+ // // look for BOM.
+ //
+ // int c1 = 0xff & bytes[index];
+ // int c2 = 0xff & bytes[index + 1];
+ // if (c1 == 0xFF && c2 == 0xFE)
+ // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM;
+ // else if (c1 == 0xFE && c2 == 0xFF)
+ // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM;
+ // }
+ //
+ // }
+ //
+ // /*
+ // * Guess the character encoding of arbitrary character data in a data
+ // * buffer.
+ // *
+ // * The data fills the entire buffer. If it is terminated, the terminator
+ // * byte(s) will be the last bytes in the buffer.
+ // *
+ // * This makes the problem a bit easier.
+ // */
+ // public static int guessCharacterEncodingSimple(byte bytes[], int index)
+ // throws UnicodeException
+ // {
+ // int length = bytes.length - index;
+ //
+ // if (length < 1)
+ // return CHAR_ENCODING_CODE_AMBIGUOUS;
+ //
+ // if (length >= 2)
+ // {
+ // // identify or eliminate UTF-16 with a BOM.
+ //
+ // int c1 = 0xff & bytes[index];
+ // int c2 = 0xff & bytes[index + 1];
+ // if (c1 == 0xFF && c2 == 0xFE)
+ // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM;
+ // else if (c1 == 0xFE && c2 == 0xFF)
+ // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM;
+ // }
+ //
+ // if (length >= 2)
+ // {
+ // // look for optional double-byte terminator.
+ //
+ // int c1 = 0xff & bytes[bytes.length - 2];
+ // int c2 = 0xff & bytes[bytes.length - 1];
+ // if (c1 == 0 && c2 == 0)
+ // {
+ // // definitely a flavor of UTF-16.
+ // if (length % 2 != 0)
+ // throw new UnicodeException(
+ // "Character data with double-byte terminator has an odd length.");
+ //
+ // boolean mayHaveTerminator = true;
+ // boolean mustHaveTerminator = false;
+ // boolean possibleBigEndian = new UnicodeMetricsUTF16NoBOM(
+ // BYTE_ORDER_BIG_ENDIAN).isValid(bytes, index,
+ // mayHaveTerminator, mustHaveTerminator);
+ // boolean possibleLittleEndian = new UnicodeMetricsUTF16NoBOM(
+ // BYTE_ORDER_LITTLE_ENDIAN).isValid(bytes, index,
+ // mayHaveTerminator, mustHaveTerminator);
+ // if ((!possibleBigEndian) && (!possibleLittleEndian))
+ // throw new UnicodeException(
+ // "Invalid character data, possibly UTF-16.");
+ // if (possibleBigEndian && possibleLittleEndian)
+ // return CHAR_ENCODING_CODE_AMBIGUOUS;
+ // if (possibleBigEndian)
+ // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM;
+ // if (possibleLittleEndian)
+ // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM;
+ // }
+ // }
+ //
+ // List possibleEncodings = new ArrayList();
+ // if (length % 2 == 0)
+ // {
+ // boolean mayHaveTerminator = true;
+ // boolean mustHaveTerminator = false;
+ // boolean possibleBigEndian = new UnicodeMetricsUTF16NoBOM(
+ // BYTE_ORDER_BIG_ENDIAN).isValid(bytes, index,
+ // mayHaveTerminator, mustHaveTerminator);
+ // boolean possibleLittleEndian = new UnicodeMetricsUTF16NoBOM(
+ // BYTE_ORDER_LITTLE_ENDIAN).isValid(bytes, index,
+ // mayHaveTerminator, mustHaveTerminator);
+ //
+ // if (possibleBigEndian)
+ // return CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM;
+ // if (possibleLittleEndian)
+ // return CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM;
+ // }
+ //
+ // }
+
+ public static final boolean isValidISO_8859_1(String s)
+ {
+ try
+ {
+ String roundtrip = new String(s.getBytes("ISO-8859-1"),
+ "ISO-8859-1");
+ return s.equals(roundtrip);
+ } catch (UnsupportedEncodingException e)
+ {
+ // should never be thrown.
+ throw new RuntimeException("Error parsing string.", e);
+ }
+ }
+
+ /*
+ * Return the index of the first utf-16 terminator (ie. two even-aligned
+ * nulls). If not found, return -1.
+ */
+ private static int findFirstDoubleByteTerminator(byte bytes[], int index)
+ {
+ for (int i = index; i < bytes.length - 1; i += 2)
+ {
+ int c1 = 0xff & bytes[index];
+ int c2 = 0xff & bytes[index + 1];
+ if (c1 == 0 && c2 == 0)
+ return i;
+ }
+ return -1;
+ }
+
+ public final int findEndWithTerminator(byte bytes[], int index)
+ throws UnicodeException
+ {
+ return findEnd(bytes, index, true);
+ }
+
+ public final int findEndWithoutTerminator(byte bytes[], int index)
+ throws UnicodeException
+ {
+ return findEnd(bytes, index, false);
+ }
+
+ protected abstract int findEnd(byte bytes[], int index,
+ boolean includeTerminator) throws UnicodeException;
+
+ public static UnicodeUtils getInstance(int charEncodingCode)
+ throws UnicodeException
+ {
+ switch (charEncodingCode)
+ {
+ case CHAR_ENCODING_CODE_ISO_8859_1:
+ return new UnicodeMetricsASCII();
+ case CHAR_ENCODING_CODE_UTF_8:
+ // Debug.debug("CHAR_ENCODING_CODE_UTF_8");
+ return new UnicodeMetricsUTF8();
+ case CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_WITH_BOM:
+ case CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_WITH_BOM:
+ // Debug.debug("CHAR_ENCODING_CODE_UTF_16_WITH_BOM");
+ return new UnicodeMetricsUTF16WithBOM();
+ case CHAR_ENCODING_CODE_UTF_16_BIG_ENDIAN_NO_BOM:
+ return new UnicodeMetricsUTF16NoBOM(BYTE_ORDER_BIG_ENDIAN);
+ case CHAR_ENCODING_CODE_UTF_16_LITTLE_ENDIAN_NO_BOM:
+ return new UnicodeMetricsUTF16NoBOM(BYTE_ORDER_LITTLE_ENDIAN);
+ default:
+ throw new UnicodeException("Unknown char encoding code: "
+ + charEncodingCode);
+ }
+ }
+
+ private static class UnicodeMetricsASCII extends UnicodeUtils
+ {
+ public int findEnd(byte bytes[], int index, boolean includeTerminator)
+ throws UnicodeException
+ {
+ for (int i = index; i < bytes.length; i++)
+ {
+ if (bytes[i] == 0)
+ return includeTerminator ? i + 1 : i;
+ }
+ return bytes.length;
+ // throw new UnicodeException("Terminator not found.");
+ }
+ }
+
+ // private static class UnicodeMetricsISO_8859_1 extends UnicodeUtils
+ // {
+ // public int findEnd(byte bytes[], int index, boolean includeTerminator)
+ // throws UnicodeException
+ // {
+ // for (int i = index; i < bytes.length; i++)
+ // {
+ // if (bytes[i] == 0)
+ // return includeTerminator ? i + 1 : i;
+ // }
+ // return bytes.length;
+ // // throw new UnicodeException("Terminator not found.");
+ // }
+ // }
+
+ private static class UnicodeMetricsUTF8 extends UnicodeUtils
+ {
+
+ public int findEnd(byte bytes[], int index, boolean includeTerminator)
+ throws UnicodeException
+ {
+ // http://en.wikipedia.org/wiki/UTF-8
+
+ while (true)
+ {
+ if (index == bytes.length)
+ return bytes.length;
+ if (index > bytes.length)
+ throw new UnicodeException("Terminator not found.");
+
+ int c1 = 0xff & bytes[index++];
+ if (c1 == 0)
+ return includeTerminator ? index : index - 1;
+ else if (c1 <= 0x7f)
+ continue;
+ else if (c1 <= 0xDF)
+ {
+ if (index >= bytes.length)
+ throw new UnicodeException("Invalid unicode.");
+
+ int c2 = 0xff & bytes[index++];
+ if (c2 < 0x80 || c2 > 0xBF)
+ throw new UnicodeException("Invalid code point.");
+ } else if (c1 <= 0xEF)
+ {
+ if (index >= bytes.length - 1)
+ throw new UnicodeException("Invalid unicode.");
+
+ int c2 = 0xff & bytes[index++];
+ if (c2 < 0x80 || c2 > 0xBF)
+ throw new UnicodeException("Invalid code point.");
+ int c3 = 0xff & bytes[index++];
+ if (c3 < 0x80 || c3 > 0xBF)
+ throw new UnicodeException("Invalid code point.");
+ } else if (c1 <= 0xF4)
+ {
+ if (index >= bytes.length - 2)
+ throw new UnicodeException("Invalid unicode.");
+
+ int c2 = 0xff & bytes[index++];
+ if (c2 < 0x80 || c2 > 0xBF)
+ throw new UnicodeException("Invalid code point.");
+ int c3 = 0xff & bytes[index++];
+ if (c3 < 0x80 || c3 > 0xBF)
+ throw new UnicodeException("Invalid code point.");
+ int c4 = 0xff & bytes[index++];
+ if (c4 < 0x80 || c4 > 0xBF)
+ throw new UnicodeException("Invalid code point.");
+ } else
+ throw new UnicodeException("Invalid code point.");
+ }
+ }
+ }
+
+ private abstract static class UnicodeMetricsUTF16 extends UnicodeUtils
+ {
+ protected static final int BYTE_ORDER_BIG_ENDIAN = 0;
+ protected static final int BYTE_ORDER_LITTLE_ENDIAN = 1;
+ protected int byteOrder = BYTE_ORDER_BIG_ENDIAN;
+
+ public UnicodeMetricsUTF16(int byteOrder)
+ {
+ this.byteOrder = byteOrder;
+ }
+
+ public boolean isValid(byte bytes[], int index,
+ boolean mayHaveTerminator, boolean mustHaveTerminator)
+ throws UnicodeException
+ {
+ // http://en.wikipedia.org/wiki/UTF-16/UCS-2
+
+ while (true)
+ {
+ if (index == bytes.length)
+ {
+ // end of buffer, no terminator found.
+ return !mustHaveTerminator;
+ }
+
+ if (index >= bytes.length - 1)
+ {
+ // end of odd-length buffer, no terminator found.
+ return false;
+ }
+
+ int c1 = 0xff & bytes[index++];
+ int c2 = 0xff & bytes[index++];
+ int msb1 = byteOrder == BYTE_ORDER_BIG_ENDIAN ? c1 : c2;
+
+ if (c1 == 0 && c2 == 0)
+ {
+ // terminator found.
+ return mayHaveTerminator;
+ }
+
+ if (msb1 >= 0xD8)
+ {
+ // Surrogate pair found.
+
+ if (msb1 >= 0xDC)
+ {
+ // invalid first surrogate.
+ return false;
+ }
+
+ if (index >= bytes.length - 1)
+ {
+ // missing second surrogate.
+ return false;
+ }
+
+ // second word.
+ int c3 = 0xff & bytes[index++];
+ int c4 = 0xff & bytes[index++];
+ int msb2 = byteOrder == BYTE_ORDER_BIG_ENDIAN ? c3 : c4;
+ if (msb2 < 0xDC)
+ {
+ // invalid second surrogate.
+ return false;
+ }
+ }
+ }
+ }
+
+ public int findEnd(byte bytes[], int index, boolean includeTerminator)
+ throws UnicodeException
+ {
+ // http://en.wikipedia.org/wiki/UTF-16/UCS-2
+
+ while (true)
+ {
+ if (index == bytes.length)
+ return bytes.length;
+ if (index > bytes.length - 1)
+ throw new UnicodeException("Terminator not found.");
+
+ int c1 = 0xff & bytes[index++];
+ int c2 = 0xff & bytes[index++];
+ int msb1 = byteOrder == BYTE_ORDER_BIG_ENDIAN ? c1 : c2;
+
+ if (c1 == 0 && c2 == 0)
+ {
+ return includeTerminator ? index : index - 2;
+ } else if (msb1 >= 0xD8)
+ {
+ if (index > bytes.length - 1)
+ throw new UnicodeException("Terminator not found.");
+
+ // second word.
+ int c3 = 0xff & bytes[index++];
+ int c4 = 0xff & bytes[index++];
+ int msb2 = byteOrder == BYTE_ORDER_BIG_ENDIAN ? c3 : c4;
+ if (msb2 < 0xDC)
+ throw new UnicodeException("Invalid code point.");
+ }
+ }
+ }
+ }
+
+ private static class UnicodeMetricsUTF16NoBOM extends UnicodeMetricsUTF16
+ {
+
+ public UnicodeMetricsUTF16NoBOM(final int byteOrder)
+ {
+ super(byteOrder);
+ }
+
+ }
+
+ private static class UnicodeMetricsUTF16WithBOM extends UnicodeMetricsUTF16
+ {
+
+ public UnicodeMetricsUTF16WithBOM()
+ {
+ super(BYTE_ORDER_BIG_ENDIAN);
+ }
+
+ public int findEnd(byte bytes[], int index, boolean includeTerminator)
+ throws UnicodeException
+ {
+ // http://en.wikipedia.org/wiki/UTF-16/UCS-2
+
+ if (index >= bytes.length - 1)
+ throw new UnicodeException("Missing BOM.");
+
+ int c1 = 0xff & bytes[index++];
+ int c2 = 0xff & bytes[index++];
+ if (c1 == 0xFF && c2 == 0xFE)
+ byteOrder = BYTE_ORDER_LITTLE_ENDIAN;
+ else if (c1 == 0xFE && c2 == 0xFF)
+ byteOrder = BYTE_ORDER_BIG_ENDIAN;
+ else
+ throw new UnicodeException("Invalid byte order mark.");
+
+ return super.findEnd(bytes, index, includeTerminator);
+ }
+ }
}