You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@xerces.apache.org by el...@apache.org on 2001/06/06 23:11:26 UTC
cvs commit: xml-xerces/java/src/org/apache/xerces/utils/regex Token.java
elena 01/06/06 14:11:26
Modified: java/src/org/apache/xerces/utils/regex Token.java
Log:
Added support for block names defined in Unicode 3.1 (the XML Schema REC requires support for them)
Revision Changes Path
1.6 +136 -85 xml-xerces/java/src/org/apache/xerces/utils/regex/Token.java
Index: Token.java
===================================================================
RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/utils/regex/Token.java,v
retrieving revision 1.5
retrieving revision 1.6
diff -u -r1.5 -r1.6
--- Token.java 2001/05/17 20:11:12 1.5
+++ Token.java 2001/06/06 21:11:23 1.6
@@ -654,90 +654,125 @@
static final int CHAR_PUNCTUATION = 36;
static final int CHAR_SYMBOL = 37;
static final String[] blockNames = {
- "Basic Latin", // 0
- "Latin-1 Supplement",
- "Latin Extended-A",
- "Latin Extended-B",
- "IPA Extensions",
- "Spacing Modifier Letters",
- "Combining Diacritical Marks",
- "Greek",
- "Cyrillic", // 8
- "Armenian",
- "Hebrew",
- "Arabic",
- "Devanagari",
- "Bengali",
- "Gurmukhi",
- "Gujarati",
- "Oriya", // 16
- "Tamil",
- "Telugu",
- "Kannada",
- "Malayalam",
- "Thai",
- "Lao",
- "Tibetan",
- "Georgian", // 24
- "Hangul Jamo",
- "Latin Extended Additional",
- "Greek Extended",
- "General Punctuation",
- "Superscripts and Subscripts",
- "Currency Symbols",
- "Combining Marks for Symbols",
- "Letterlike Symbols", // 32
- "Number Forms",
- "Arrows",
- "Mathematical Operators",
- "Miscellaneous Technical",
- "Control Pictures",
- "Optical Character Recognition",
- "Enclosed Alphanumerics",
- "Box Drawing", // 40
- "Block Elements",
- "Geometric Shapes",
- "Miscellaneous Symbols",
- "Dingbats",
- "CJK Symbols and Punctuation",
- "Hiragana",
- "Katakana",
- "Bopomofo", // 48
- "Hangul Compatibility Jamo",
- "Kanbun",
- "Enclosed CJK Letters and Months",
- "CJK Compatibility",
- "CJK Unified Ideographs",
- "Hangul Syllables",
- "High Surrogates",
- "High Private Use Surrogates", // 56
- "Low Surrogates",
- "Private Use",
- "CJK Compatibility Ideographs",
- "Alphabetic Presentation Forms",
- "Arabic Presentation Forms-A",
- "Combining Half Marks",
- "CJK Compatibility Forms",
- "Small Form Variants", // 64
- "Arabic Presentation Forms-B",
- "Specials",
- "Halfwidth and Fullwidth Forms", // 67
+ /*0000..007F;*/ "Basic Latin",
+ /*0080..00FF;*/ "Latin-1 Supplement",
+ /*0100..017F;*/ "Latin Extended-A",
+ /*0180..024F;*/ "Latin Extended-B",
+ /*0250..02AF;*/ "IPA Extensions",
+ /*02B0..02FF;*/ "Spacing Modifier Letters",
+ /*0300..036F;*/ "Combining Diacritical Marks",
+ /*0370..03FF;*/ "Greek",
+ /*0400..04FF;*/ "Cyrillic",
+ /*0530..058F;*/ "Armenian",
+ /*0590..05FF;*/ "Hebrew",
+ /*0600..06FF;*/ "Arabic",
+ /*0700..074F;*/ "Syriac",
+ /*0780..07BF;*/ "Thaana",
+ /*0900..097F;*/ "Devanagari",
+ /*0980..09FF;*/ "Bengali",
+ /*0A00..0A7F;*/ "Gurmukhi",
+ /*0A80..0AFF;*/ "Gujarati",
+ /*0B00..0B7F;*/ "Oriya",
+ /*0B80..0BFF;*/ "Tamil",
+ /*0C00..0C7F;*/ "Telugu",
+ /*0C80..0CFF;*/ "Kannada",
+ /*0D00..0D7F;*/ "Malayalam",
+ /*0D80..0DFF;*/ "Sinhala",
+ /*0E00..0E7F;*/ "Thai",
+ /*0E80..0EFF;*/ "Lao",
+ /*0F00..0FFF;*/ "Tibetan",
+ /*1000..109F;*/ "Myanmar",
+ /*10A0..10FF;*/ "Georgian",
+ /*1100..11FF;*/ "Hangul Jamo",
+ /*1200..137F;*/ "Ethiopic",
+ /*13A0..13FF;*/ "Cherokee",
+ /*1400..167F;*/ "Unified Canadian Aboriginal Syllabics",
+ /*1680..169F;*/ "Ogham",
+ /*16A0..16FF;*/ "Runic",
+ /*1780..17FF;*/ "Khmer",
+ /*1800..18AF;*/ "Mongolian",
+ /*1E00..1EFF;*/ "Latin Extended Additional",
+ /*1F00..1FFF;*/ "Greek Extended",
+ /*2000..206F;*/ "General Punctuation",
+ /*2070..209F;*/ "Superscripts and Subscripts",
+ /*20A0..20CF;*/ "Currency Symbols",
+ /*20D0..20FF;*/ "Combining Marks for Symbols",
+ /*2100..214F;*/ "Letterlike Symbols",
+ /*2150..218F;*/ "Number Forms",
+ /*2190..21FF;*/ "Arrows",
+ /*2200..22FF;*/ "Mathematical Operators",
+ /*2300..23FF;*/ "Miscellaneous Technical",
+ /*2400..243F;*/ "Control Pictures",
+ /*2440..245F;*/ "Optical Character Recognition",
+ /*2460..24FF;*/ "Enclosed Alphanumerics",
+ /*2500..257F;*/ "Box Drawing",
+ /*2580..259F;*/ "Block Elements",
+ /*25A0..25FF;*/ "Geometric Shapes",
+ /*2600..26FF;*/ "Miscellaneous Symbols",
+ /*2700..27BF;*/ "Dingbats",
+ /*2800..28FF;*/ "Braille Patterns",
+ /*2E80..2EFF;*/ "CJK Radicals Supplement",
+ /*2F00..2FDF;*/ "Kangxi Radicals",
+ /*2FF0..2FFF;*/ "Ideographic Description Characters",
+ /*3000..303F;*/ "CJK Symbols and Punctuation",
+ /*3040..309F;*/ "Hiragana",
+ /*30A0..30FF;*/ "Katakana",
+ /*3100..312F;*/ "Bopomofo",
+ /*3130..318F;*/ "Hangul Compatibility Jamo",
+ /*3190..319F;*/ "Kanbun",
+ /*31A0..31BF;*/ "Bopomofo Extended",
+ /*3200..32FF;*/ "Enclosed CJK Letters and Months",
+ /*3300..33FF;*/ "CJK Compatibility",
+ /*3400..4DB5;*/ "CJK Unified Ideographs Extension A",
+ /*4E00..9FFF;*/ "CJK Unified Ideographs",
+ /*A000..A48F;*/ "Yi Syllables",
+ /*A490..A4CF;*/ "Yi Radicals",
+ /*AC00..D7A3;*/ "Hangul Syllables",
+ /*D800..DB7F;*/ "High Surrogates",
+ /*DB80..DBFF;*/ "High Private Use Surrogates",
+ /*DC00..DFFF;*/ "Low Surrogates",
+ /*E000..F8FF;*/ "Private Use",
+ /*F900..FAFF;*/ "CJK Compatibility Ideographs",
+ /*FB00..FB4F;*/ "Alphabetic Presentation Forms",
+ /*FB50..FDFF;*/ "Arabic Presentation Forms-A",
+ /*FE20..FE2F;*/ "Combining Half Marks",
+ /*FE30..FE4F;*/ "CJK Compatibility Forms",
+ /*FE50..FE6F;*/ "Small Form Variants",
+ /*FE70..FEFE;*/ "Arabic Presentation Forms-B",
+ /*FEFF..FEFF;*/ "Specials",
+ /*FF00..FFEF;*/ "Halfwidth and Fullwidth Forms",
+ // The second Specials range (FFF0..FFFD) is not listed here; it is added manually
+ /*10300..1032F;*/ "Old Italic",
+ /*10330..1034F;*/ "Gothic",
+ /*10400..1044F;*/ "Deseret",
+ /*1D000..1D0FF;*/ "Byzantine Musical Symbols",
+ /*1D100..1D1FF;*/ "Musical Symbols",
+ /*1D400..1D7FF;*/ "Mathematical Alphanumeric Symbols",
+ /*20000..2A6D6;*/ "CJK Unified Ideographs Extension B",
+ /*2F800..2FA1F;*/ "CJK Compatibility Ideographs Supplement",
+ /*E0000..E007F;*/ "Tags",
+ // The two supplementary Private Use ranges (F0000..FFFFD and 100000..10FFFD) are not listed here; they are added manually
+
};
- static final String blockRanges =
- "\u0000\u007F\u0080\u00FF\u0100\u017F\u0180\u024F\u0250\u02AF\u02B0\u02FF"
- +"\u0300\u036F\u0370\u03FF\u0400\u04FF\u0530\u058F\u0590\u05FF\u0600\u06FF"
- +"\u0900\u097F\u0980\u09FF\u0A00\u0A7F\u0A80\u0AFF\u0B00\u0B7F\u0B80\u0BFF"
- +"\u0C00\u0C7F\u0C80\u0CFF\u0D00\u0D7F\u0E00\u0E7F\u0E80\u0EFF\u0F00\u0FBF"
- +"\u10A0\u10FF\u1100\u11FF\u1E00\u1EFF\u1F00\u1FFF\u2000\u206F\u2070\u209F"
- +"\u20A0\u20CF\u20D0\u20FF\u2100\u214F\u2150\u218F\u2190\u21FF\u2200\u22FF"
- +"\u2300\u23FF\u2400\u243F\u2440\u245F\u2460\u24FF\u2500\u257F\u2580\u259F"
- +"\u25A0\u25FF\u2600\u26FF\u2700\u27BF\u3000\u303F\u3040\u309F\u30A0\u30FF"
- +"\u3100\u312F\u3130\u318F\u3190\u319F\u3200\u32FF\u3300\u33FF\u4E00\u9FFF"
- +"\uAC00\uD7A3\uD800\uDB7F\uDB80\uDBFF\uDC00\uDFFF\uE000\uF8FF\uF900\uFAFF"
- +"\uFB00\uFB4F\uFB50\uFDFF\uFE20\uFE2F\uFE30\uFE4F\uFE50\uFE6F\uFE70\uFEFE"
- +"\uFEFF\uFEFF\uFF00\uFFEF";
+ // The following ranges are not part of blockRanges and must be added manually:
+ //F0000..FFFFD; "Private Use",
+ //100000..10FFFD; "Private Use"
+ //FFF0..FFFD; "Specials",
+ static final String blockRanges = // flat list of (start, end) chars, one pair per blockNames entry; the registration loop reads pair i via charAt(i*2) and charAt(i*2+1)
+ "\u0000\u007F\u0080\u00FF\u0100\u017F\u0180\u024F\u0250\u02AF\u02B0\u02FF\u0300\u036F"
+"\u0370\u03FF\u0400\u04FF\u0530\u058F\u0590\u05FF\u0600\u06FF\u0700\u074F\u0780\u07BF"
+"\u0900\u097F\u0980\u09FF\u0A00\u0A7F\u0A80\u0AFF\u0B00\u0B7F\u0B80\u0BFF\u0C00\u0C7F\u0C80\u0CFF"
+"\u0D00\u0D7F\u0D80\u0DFF\u0E00\u0E7F\u0E80\u0EFF\u0F00\u0FFF\u1000\u109F\u10A0\u10FF\u1100\u11FF"
+"\u1200\u137F\u13A0\u13FF\u1400\u167F\u1680\u169F\u16A0\u16FF\u1780\u17FF\u1800\u18AF\u1E00\u1EFF"
+"\u1F00\u1FFF\u2000\u206F\u2070\u209F\u20A0\u20CF\u20D0\u20FF\u2100\u214F\u2150\u218F\u2190\u21FF\u2200\u22FF"
+"\u2300\u23FF\u2400\u243F\u2440\u245F\u2460\u24FF\u2500\u257F\u2580\u259F\u25A0\u25FF\u2600\u26FF\u2700\u27BF"
+"\u2800\u28FF\u2E80\u2EFF\u2F00\u2FDF\u2FF0\u2FFF\u3000\u303F\u3040\u309F\u30A0\u30FF\u3100\u312F\u3130\u318F"
+"\u3190\u319F\u31A0\u31BF\u3200\u32FF\u3300\u33FF\u3400\u4DB5\u4E00\u9FFF\uA000\uA48F\uA490\uA4CF"
+"\uAC00\uD7A3\uD800\uDB7F\uDB80\uDBFF\uDC00\uDFFF\uE000\uF8FF\uF900\uFAFF\uFB00\uFB4F\uFB50\uFDFF"
+"\uFE20\uFE2F\uFE30\uFE4F\uFE50\uFE6F\uFE70\uFEFE\uFEFF\uFEFF\uFF00\uFFEF\u10300\u1032F\u10330\u1034F" // NOTE(review): a Java unicode escape consumes exactly four hex digits, so each 5-digit escape on this line and the next yields two chars (e.g. U+1030 followed by '0'), not one supplementary code point; a char-indexed String cannot hold these ranges
+"\u10400\u1044F\u1D000\u1D0FFs\u1D100\u1D1FF\u1D400\u1D7FF\u20000\u2A6D6\u2F800\u2FA1F\uE0000\uE007F"; // NOTE(review): the stray 's' after the 1D0FF escape adds one extra char and shifts every following pair -- looks like a typo, confirm and remove
- static protected RangeToken getRange(String name, boolean positive) {
+ static protected RangeToken getRange(String name, boolean positive) {
if (Token.categories.size() == 0) {
synchronized (Token.categories) {
Token[] ranges = new Token[Token.categoryNames.length];
@@ -823,23 +858,38 @@
Token.complementRanges(ranges[i]));
}
}
+ //REVISIT: is anybody other than Xerces referring to blocks by their name?
+ // or can we just create all the names in IsBLOCKNAME format?
+ //
+ StringBuffer buffer = new StringBuffer(50);
for (int i = 0; i < Token.blockNames.length; i ++) {
Token r1 = Token.createRange();
+
int rstart = Token.blockRanges.charAt(i*2);
int rend = Token.blockRanges.charAt(i*2+1);
String n = Token.blockNames[i];
+ //DEBUGGING
+ //System.out.println(n+" " +Integer.toHexString(rstart)
+ // +"-"+ Integer.toHexString(rend));
r1.addRange(rstart, rend);
if (n.equals("Specials"))
r1.addRange(0xfff0, 0xfffd);
+ if (n.equals("Private Use")) {
+ r1.addRange(0xF0000,0xFFFFD);
+ r1.addRange(0x100000,0x10FFFD);
+ }
Token.categories.put(n, r1);
Token.categories2.put(n, Token.complementRanges(r1));
+ buffer.setLength(0);
+ buffer.append("Is");
if (n.indexOf(' ') >= 0) {
- StringBuffer buffer = new StringBuffer(n.length()+2);
- buffer.append("Is");
for (int ci = 0; ci < n.length(); ci ++)
if (n.charAt(ci) != ' ') buffer.append((char)n.charAt(ci));
- Token.setAlias(new String(buffer), n, true);
}
+ else {
+ buffer.append(n);
+ }
+ Token.setAlias(buffer.toString(), n, true);
}
// REVISIT: remove this code later
@@ -925,6 +975,7 @@
} // if null
RangeToken tok = positive ? (RangeToken)Token.categories.get(name)
: (RangeToken)Token.categories2.get(name);
+ if (tok == null) System.out.println(name);
return tok;
}
---------------------------------------------------------------------
To unsubscribe, e-mail: xerces-cvs-unsubscribe@xml.apache.org
For additional commands, e-mail: xerces-cvs-help@xml.apache.org