You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@xerces.apache.org by je...@locus.apache.org on 2000/06/21 01:57:26 UTC
cvs commit: xml-xerces/java/src/org/apache/xerces/utils/regex BMPattern.java Match.java ParserForXMLSchema.java RangeToken.java RegexParser.java RegularExpression.java REUtil.java Token.java
jeffreyr 00/06/20 16:57:26
Modified: java/src/org/apache/xerces/utils/regex BMPattern.java
Match.java ParserForXMLSchema.java RangeToken.java
RegexParser.java RegularExpression.java REUtil.java
Token.java
Log:
It enables character class subtraction like [a-z-[c]] and fixes some bugs. Kento Tamura fix
Revision Changes Path
1.2 +3 -3 xml-xerces/java/src/org/apache/xerces/utils/regex/BMPattern.java
Index: BMPattern.java
===================================================================
RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/utils/regex/BMPattern.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- BMPattern.java 2000/04/04 19:31:31 1.1
+++ BMPattern.java 2000/06/20 23:57:18 1.2
@@ -101,7 +101,7 @@
/**
*
- * @return -1 if iterator does not contain this pattern.
+ * @return -1 if <var>iterator</var> does not contain this pattern.
*/
public int matches(CharacterIterator iterator, int start, int limit) {
if (this.ignoreCase) return this.matchesIgnoreCase(iterator, start, limit);
@@ -126,7 +126,7 @@
/**
*
- * @return -1 if str does not contain this pattern.
+ * @return -1 if <var>str</var> does not contain this pattern.
*/
public int matches(String str, int start, int limit) {
if (this.ignoreCase) return this.matchesIgnoreCase(str, start, limit);
@@ -151,7 +151,7 @@
}
/**
*
- * @return -1 if str does not contain this pattern.
+ * @return -1 if <var>chars</var> does not contain this pattern.
*/
public int matches(char[] chars, int start, int limit) {
if (this.ignoreCase) return this.matchesIgnoreCase(chars, start, limit);
1.2 +36 -4 xml-xerces/java/src/org/apache/xerces/utils/regex/Match.java
Index: Match.java
===================================================================
RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/utils/regex/Match.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- Match.java 2000/04/04 19:31:31 1.1
+++ Match.java 2000/06/20 23:57:18 1.2
@@ -62,6 +62,13 @@
/**
*
+ * An instance of this class holds the ranges captured by a match.
+ *
+ * @see org.apache.xerces.utils.regex.RegularExpression#matches(char[], int, int, org.apache.xerces.utils.regex.Match)
+ * @see org.apache.xerces.utils.regex.RegularExpression#matches(char[], org.apache.xerces.utils.regex.Match)
+ * @see org.apache.xerces.utils.regex.RegularExpression#matches(java.text.CharacterIterator, org.apache.xerces.utils.regex.Match)
+ * @see org.apache.xerces.utils.regex.RegularExpression#matches(java.lang.String, int, int, org.apache.xerces.utils.regex.Match)
+ * @see org.apache.xerces.utils.regex.RegularExpression#matches(java.lang.String, org.apache.xerces.utils.regex.Match)
* @author TAMURA Kent <kent@trl.ibm.co.jp>
*/
public class Match implements Cloneable {
@@ -73,9 +80,15 @@
String strSource = null;
char[] charSource = null;
+ /**
+ * Creates an instance.
+ */
public Match() {
}
+ /**
+ *
+ */
public synchronized Object clone() {
Match ma = new Match();
if (this.nofgroups > 0) {
@@ -90,6 +103,9 @@
return ma;
}
+ /**
+ *
+ */
protected void setNumberOfGroups(int n) {
int oldn = this.nofgroups;
this.nofgroups = n;
@@ -104,26 +120,41 @@
}
}
+ /**
+ *
+ */
protected void setSource(CharacterIterator ci) {
this.ciSource = ci;
this.strSource = null;
this.charSource = null;
}
+ /**
+ *
+ */
protected void setSource(String str) {
this.ciSource = null;
this.strSource = str;
this.charSource = null;
}
+ /**
+ *
+ */
protected void setSource(char[] chars) {
this.ciSource = null;
this.strSource = null;
this.charSource = chars;
}
+ /**
+ *
+ */
protected void setBeginning(int index, int v) {
this.beginpos[index] = v;
}
+ /**
+ *
+ */
protected void setEnd(int index, int v) {
this.endpos[index] = v;
}
@@ -178,13 +209,14 @@
throw new IllegalArgumentException("The parameter must be less than "
+this.nofgroups+": "+index);
String ret;
+ int begin = this.beginpos[index], end = this.endpos[index];
+ if (begin < 0 || end < 0) return null;
if (this.ciSource != null) {
- ret = REUtil.substring(this.ciSource, this.beginpos[index], this.endpos[index]);
+ ret = REUtil.substring(this.ciSource, begin, end);
} else if (this.strSource != null) {
- ret = this.strSource.substring(this.beginpos[index], this.endpos[index]);
+ ret = this.strSource.substring(begin, end);
} else {
- int begin = this.beginpos[index];
- ret = new String(this.charSource, begin, this.endpos[index]-begin);
+ ret = new String(this.charSource, begin, end-begin);
}
return ret;
}
1.3 +143 -8 xml-xerces/java/src/org/apache/xerces/utils/regex/ParserForXMLSchema.java
Index: ParserForXMLSchema.java
===================================================================
RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/utils/regex/ParserForXMLSchema.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- ParserForXMLSchema.java 2000/06/02 23:08:32 1.2
+++ ParserForXMLSchema.java 2000/06/20 23:57:18 1.3
@@ -62,14 +62,10 @@
import java.util.Hashtable;
import java.util.Locale;
-/*
- * TODO:
- * Grammar of character classes
- * Shorthands in character classes
- */
-
/**
+ * A regular expression parser for the XML Schema.
*
+ * @author TAMURA Kent <kent@trl.ibm.co.jp>
*/
class ParserForXMLSchema extends RegexParser {
@@ -176,11 +172,14 @@
this.next();
return this.getTokenForShorthand('I');
}
+ Token processBacksolidus_g() throws ParseException {
+ throw this.ex("parser.process.1", this.offset-2);
+ }
Token processBacksolidus_X() throws ParseException {
- throw ex("parser.process.1", this.offset);
+ throw ex("parser.process.1", this.offset-2);
}
Token processBackreference() throws ParseException {
- throw ex("parser.process.1", this.offset);
+ throw ex("parser.process.1", this.offset-4);
}
int processCIinCharacterClass(RangeToken tok, int c) {
@@ -189,6 +188,142 @@
}
+ /**
+ * Parses a character-class-expression, not a character-class-escape.
+ *
+ * c-c-expression ::= '[' c-group ']'
+ * c-group ::= positive-c-group | negative-c-group | c-c-subtraction
+ * positive-c-group ::= (c-range | c-c-escape)+
+ * negative-c-group ::= '^' positive-c-group
+ * c-c-subtraction ::= (positive-c-group | negative-c-group) subtraction
+ * subtraction ::= '-' c-c-expression
+ * c-range ::= single-range | from-to-range
+ * single-range ::= multi-c-escape | category-c-escape | block-c-escape | <any XML char>
+ * cc-normal-c ::= <any character except [, ], \>
+ * from-to-range ::= cc-normal-c '-' cc-normal-c
+ *
+ * @param useNrange Ignored.
+ * @return This never returns an NRange token.
+ */
+ protected RangeToken parseCharacterClass(boolean useNrange) throws ParseException {
+ this.setContext(S_INBRACKETS);
+ this.next(); // '['
+ boolean nrange = false;
+ RangeToken base = null;
+ RangeToken tok;
+ if (this.read() == T_CHAR && this.chardata == '^') {
+ nrange = true;
+ this.next(); // '^'
+ base = Token.createRange();
+ base.addRange(0, Token.UTF16_MAX);
+ tok = Token.createRange();
+ } else {
+ tok = Token.createRange();
+ }
+ int type;
+ boolean firstloop = true;
+ while ((type = this.read()) != T_EOF) { // Don't use 'continue' for this loop.
+ // single-range | from-to-range | subtraction
+ if (type == T_CHAR && this.chardata == ']' && !firstloop) {
+ if (nrange) {
+ base.subtractRanges(tok);
+ tok = base;
+ }
+ break;
+ }
+ int c = this.chardata;
+ boolean end = false;
+ if (type == T_BACKSOLIDUS) {
+ switch (c) {
+ case 'd': case 'D':
+ case 'w': case 'W':
+ case 's': case 'S':
+ tok.mergeRanges(this.getTokenForShorthand(c));
+ end = true;
+ break;
+
+ case 'i': case 'I':
+ case 'c': case 'C':
+ c = this.processCIinCharacterClass(tok, c);
+ if (c < 0) end = true;
+ break;
+
+ case 'p':
+ case 'P':
+ int pstart = this.offset;
+ RangeToken tok2 = this.processBacksolidus_pP(c);
+ if (tok2 == null) throw this.ex("parser.atom.5", pstart);
+ tok.mergeRanges(tok2);
+ end = true;
+ break;
+
+ default:
+ c = this.decodeEscaped();
+ } // \ + c
+ } // backsolidus
+ else if (type == T_XMLSCHEMA_CC_SUBTRACTION && !firstloop) {
+ // Subtraction
+ if (nrange) {
+ base.subtractRanges(tok);
+ tok = base;
+ }
+ RangeToken range2 = this.parseCharacterClass(false);
+ tok.subtractRanges(range2);
+ if (this.read() != T_CHAR || this.chardata != ']')
+ throw this.ex("parser.cc.5", this.offset);
+ break; // Exit this loop
+ }
+ this.next();
+ if (!end) { // if not shorthands...
+ if (type == T_CHAR) {
+ if (c == '[') throw this.ex("parser.cc.6", this.offset-2);
+ if (c == ']') throw this.ex("parser.cc.7", this.offset-2);
+ }
+ if (this.read() != T_CHAR || this.chardata != '-') { // Here is no '-'.
+ tok.addRange(c, c);
+ } else { // Found '-'
+ // Is this '-' a from-to token?
+ this.next(); // Skips '-'
+ if ((type = this.read()) == T_EOF) throw this.ex("parser.cc.2", this.offset);
+ // c '-' ']' -> '-' is a single-range.
+ if (type == T_CHAR && this.chardata == ']') {
+ tok.addRange(c, c);
+ tok.addRange('-', '-');
+ }
+ // c '-' '-[' -> '-' is a single-range.
+ else if (type == T_XMLSCHEMA_CC_SUBTRACTION) {
+ tok.addRange(c, c);
+ tok.addRange('-', '-');
+ } else {
+ int rangeend = this.chardata;
+ if (type == T_CHAR) {
+ if (rangeend == '[') throw this.ex("parser.cc.6", this.offset-1);
+ if (rangeend == ']') throw this.ex("parser.cc.7", this.offset-1);
+ }
+ if (type == T_BACKSOLIDUS)
+ rangeend = this.decodeEscaped();
+ this.next();
+ tok.addRange(c, rangeend);
+ }
+ }
+ }
+ firstloop = false;
+ }
+ if (this.read() == T_EOF)
+ throw this.ex("parser.cc.2", this.offset);
+ tok.sortRanges();
+ tok.compactRanges();
+ //tok.dumpRanges();
+ this.setContext(S_NORMAL);
+ this.next(); // Skips ']'
+
+ return tok;
+ }
+
+ protected RangeToken parseSetOperations() throws ParseException {
+ throw this.ex("parser.process.1", this.offset);
+ }
+
Token getTokenForShorthand(int ch) {
switch (ch) {
case 'd':
1.2 +8 -9 xml-xerces/java/src/org/apache/xerces/utils/regex/RangeToken.java
Index: RangeToken.java
===================================================================
RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/utils/regex/RangeToken.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- RangeToken.java 2000/04/04 19:31:31 1.1
+++ RangeToken.java 2000/06/20 23:57:18 1.2
@@ -158,11 +158,12 @@
*/
protected void compactRanges() {
boolean DEBUG = false;
- if (this.ranges == null || this.ranges.length == 2)
+ if (this.ranges == null || this.ranges.length <= 2)
return;
if (this.isCompacted())
return;
- int base = 0, target = 0;
+ int base = 0; // Index of writing point
+ int target = 0; // Index of processing point
while (target < this.ranges.length) {
if (base != target) {
@@ -170,8 +171,6 @@
this.ranges[base+1] = this.ranges[target++];
} else
target += 2;
- if (target >= this.ranges.length)
- break;
int baseend = this.ranges[base+1];
while (target < this.ranges.length) {
if (baseend+1 < this.ranges[target])
@@ -217,10 +216,10 @@
+"] ["+this.ranges[target]
+","+this.ranges[target+1]+"]");
}
- }
+ } // while
base += 2;
}
- base += 2;
+
if (base != this.ranges.length) {
int[] result = new int[base];
System.arraycopy(this.ranges, 0, result, 0, base);
@@ -570,7 +569,7 @@
//for (int i = 0; i < asize; i ++) System.err.println("Map: "+Integer.toString(this.map[i], 16));
}
- public String toString() {
+ public String toString(int options) {
String ret;
if (this.type == RANGE) {
if (this == Token.token_dot)
@@ -585,7 +584,7 @@
StringBuffer sb = new StringBuffer();
sb.append("[");
for (int i = 0; i < this.ranges.length; i += 2) {
- if (i > 0) sb.append(",");
+ if ((options & RegularExpression.SPECIAL_COMMA) != 0 && i > 0) sb.append(",");
if (this.ranges[i] == this.ranges[i+1]) {
sb.append(escapeCharInCharClass(this.ranges[i]));
} else {
@@ -608,7 +607,7 @@
StringBuffer sb = new StringBuffer();
sb.append("[^");
for (int i = 0; i < this.ranges.length; i += 2) {
- if (i > 0) sb.append(",");
+ if ((options & RegularExpression.SPECIAL_COMMA) != 0 && i > 0) sb.append(",");
if (this.ranges[i] == this.ranges[i+1]) {
sb.append(escapeCharInCharClass(this.ranges[i]));
} else {
1.3 +185 -46 xml-xerces/java/src/org/apache/xerces/utils/regex/RegexParser.java
Index: RegexParser.java
===================================================================
RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/utils/regex/RegexParser.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- RegexParser.java 2000/05/17 18:32:55 1.2
+++ RegexParser.java 2000/06/20 23:57:18 1.3
@@ -87,10 +87,11 @@
static final int T_NEGATIVELOOKBEHIND = 17; // '(?<!'
static final int T_INDEPENDENT = 18; // '(?>'
static final int T_SET_OPERATIONS = 19; // '(?['
- static final int T_POSIX_CHARCLASS_START = 20; // '[:'
+ static final int T_POSIX_CHARCLASS_START = 20; // '[:' in a character class
static final int T_COMMENT = 21; // '(?#'
static final int T_MODIFIERS = 22; // '(?' [\-,a-z,A-Z]
static final int T_CONDITION = 23; // '(?('
+ static final int T_XMLSCHEMA_CC_SUBTRACTION = 24; // '-[' in a character class
static class ReferencePosition {
int refNumber;
@@ -108,8 +109,9 @@
ResourceBundle resources;
int chardata;
int nexttoken;
- static private final int S_NORMAL = 0;
- static private final int S_INBRACKETS = 1;
+ static protected final int S_NORMAL = 0;
+ static protected final int S_INBRACKETS = 1;
+ static protected final int S_INXBRACKETS = 2;
int context = S_NORMAL;
int parennumber = 1;
boolean hasBackReferences;
@@ -166,12 +168,14 @@
return ret;
}
+ /*
public RegularExpression createRegex(String regex, int options) throws ParseException {
Token tok = this.parse(regex, options);
return new RegularExpression(regex, tok, this.parennumber, this.hasBackReferences, options);
}
+ */
- private final void setContext(int con) {
+ protected final void setContext(int con) {
this.context = con;
}
@@ -201,8 +205,18 @@
this.chardata = this.regex.charAt(this.offset++);
break;
+ case '-':
+ if (this.isSet(RegularExpression.XMLSCHEMA_MODE)
+ && this.offset < this.regexlen && this.regex.charAt(this.offset) == '[') {
+ this.offset++;
+ ret = T_XMLSCHEMA_CC_SUBTRACTION;
+ } else
+ ret = T_CHAR;
+ break;
+
case '[':
- if (this.offset < this.regexlen && this.regex.charAt(this.offset) == ':') {
+ if (!this.isSet(RegularExpression.XMLSCHEMA_MODE)
+ && this.offset < this.regexlen && this.regex.charAt(this.offset) == ':') {
this.offset++;
ret = T_POSIX_CHARCLASS_START;
break;
@@ -567,10 +581,14 @@
Token processBacksolidus_I() throws ParseException {
throw ex("parser.process.1", this.offset);
}
- Token processBacksolidus_X() throws ParseException {
+ Token processBacksolidus_g() throws ParseException {
this.next();
return Token.getGraphemePattern();
}
+ Token processBacksolidus_X() throws ParseException {
+ this.next();
+ return Token.getCombiningCharacterSequence();
+ }
Token processBackreference() throws ParseException {
int refnum = this.chardata-'0';
Token tok = Token.createBackReference(refnum);
@@ -737,6 +755,7 @@
case 'C': return this.processBacksolidus_C();
case 'i': return this.processBacksolidus_i();
case 'I': return this.processBacksolidus_I();
+ case 'g': return this.processBacksolidus_g();
case 'X': return this.processBacksolidus_X();
case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
@@ -744,43 +763,9 @@
case 'P':
case 'p':
- boolean positive = this.chardata == 'p';
int pstart = this.offset;
- this.next();
- if (this.read() != T_CHAR) throw ex("parser.atom.2", this.offset-1);
- switch (this.chardata) {
- case 'L': // Letter
- tok = Token.getRange("L", positive); break;
- case 'M': // Mark
- tok = Token.getRange("M", positive); break;
- case 'N': // Number
- tok = Token.getRange("N", positive); break;
- case 'Z': // Separator
- tok = Token.getRange("Z", positive); break;
- case 'C': // Other
- tok = Token.getRange("C", positive); break;
- case 'P': // Punctuation
- tok = Token.getRange("P", positive); break;
- case 'S': // Symbol
- tok = Token.getRange("S", positive); break;
- case '{':
- // this.offset points the next of '{'.
- pstart = this.offset;
- int namestart = this.offset;
- int nameend = this.regex.indexOf('}', namestart);
- if (nameend < 0) throw ex("parser.atom.3", this.offset);
- this.offset = nameend+1;
- tok = Token.getRange(this.regex.substring(namestart, nameend), positive);
- /*
- if (this.isSet(RegularExpression.IGNORE_CASE))
- tok = RangeToken.createCaseInsensitiveToken(tok);
- */
- break;
-
- default:
- throw ex("parser.atom.2", this.offset-1);
- }
- if (tok == null) throw ex("parser.atom.5", pstart);
+ tok = processBacksolidus_pP(this.chardata);
+ if (tok == null) throw this.ex("parser.atom.5", pstart);
break;
default:
@@ -794,8 +779,48 @@
this.next();
break;
+ default:
+ throw this.ex("parser.atom.4", this.offset-1);
+ }
+ return tok;
+ }
+
+ protected RangeToken processBacksolidus_pP(int c) throws ParseException {
+ boolean positive = c == 'p';
+ this.next();
+ if (this.read() != T_CHAR) throw this.ex("parser.atom.2", this.offset-1);
+ RangeToken tok;
+ switch (this.chardata) {
+ case 'L': // Letter
+ tok = Token.getRange("L", positive); break;
+ case 'M': // Mark
+ tok = Token.getRange("M", positive); break;
+ case 'N': // Number
+ tok = Token.getRange("N", positive); break;
+ case 'Z': // Separator
+ tok = Token.getRange("Z", positive); break;
+ case 'C': // Other
+ tok = Token.getRange("C", positive); break;
+ case 'P': // Punctuation
+ tok = Token.getRange("P", positive); break;
+ case 'S': // Symbol
+ tok = Token.getRange("S", positive); break;
+ case '{':
+ // this.offset points the next of '{'.
+ //pstart = this.offset;
+ int namestart = this.offset;
+ int nameend = this.regex.indexOf('}', namestart);
+ if (nameend < 0) throw this.ex("parser.atom.3", this.offset);
+ this.offset = nameend+1;
+ tok = Token.getRange(this.regex.substring(namestart, nameend), positive);
+ /*
+ if (this.isSet(RegularExpression.IGNORE_CASE))
+ tok = RangeToken.createCaseInsensitiveToken(tok);
+ */
+ break;
+
default:
- throw ex("parser.atom.4", this.offset-1);
+ throw this.ex("parser.atom.2", this.offset-1);
}
return tok;
}
@@ -811,7 +836,121 @@
* range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
* bmp-code ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
*/
- private RangeToken parseCharacterClass(boolean useNrange) throws ParseException {
+ protected RangeToken parseCharacterClass(boolean useNrange) throws ParseException {
+ this.setContext(S_INBRACKETS);
+ this.next(); // '['
+ boolean nrange = false;
+ RangeToken base = null;
+ RangeToken tok;
+ if (this.read() == T_CHAR && this.chardata == '^') {
+ nrange = true;
+ this.next(); // '^'
+ if (useNrange) {
+ tok = Token.createNRange();
+ } else {
+ base = Token.createRange();
+ base.addRange(0, Token.UTF16_MAX);
+ tok = Token.createRange();
+ }
+ } else {
+ tok = Token.createRange();
+ }
+ int type;
+ boolean firstloop = true;
+ while ((type = this.read()) != T_EOF) {
+ if (type == T_CHAR && this.chardata == ']' && !firstloop)
+ break;
+ firstloop = false;
+ int c = this.chardata;
+ boolean end = false;
+ if (type == T_BACKSOLIDUS) {
+ switch (c) {
+ case 'd': case 'D':
+ case 'w': case 'W':
+ case 's': case 'S':
+ tok.mergeRanges(this.getTokenForShorthand(c));
+ end = true;
+ break;
+
+ case 'i': case 'I':
+ case 'c': case 'C':
+ c = this.processCIinCharacterClass(tok, c);
+ if (c < 0) end = true;
+ break;
+
+ case 'p':
+ case 'P':
+ int pstart = this.offset;
+ RangeToken tok2 = this.processBacksolidus_pP(c);
+ if (tok2 == null) throw this.ex("parser.atom.5", pstart);
+ tok.mergeRanges(tok2);
+ end = true;
+ break;
+
+ default:
+ c = this.decodeEscaped();
+ } // \ + c
+ } // backsolidus
+ // POSIX Character class such as [:alnum:]
+ else if (type == T_POSIX_CHARCLASS_START) {
+ int nameend = this.regex.indexOf(':', this.offset);
+ if (nameend < 0) throw this.ex("parser.cc.1", this.offset);
+ boolean positive = true;
+ if (this.regex.charAt(this.offset) == '^') {
+ this.offset ++;
+ positive = false;
+ }
+ String name = this.regex.substring(this.offset, nameend);
+ RangeToken range = Token.getRange(name, positive);
+ if (range == null) throw this.ex("parser.cc.3", this.offset);
+ tok.mergeRanges(range);
+ end = true;
+ if (nameend+1 >= this.regexlen || this.regex.charAt(nameend+1) != ']')
+ throw this.ex("parser.cc.1", nameend);
+ this.offset = nameend+2;
+ }
+ this.next();
+ if (!end) { // if not shorthands...
+ if (this.read() != T_CHAR || this.chardata != '-') { // Here is no '-'.
+ tok.addRange(c, c);
+ } else {
+ this.next(); // Skips '-'
+ if ((type = this.read()) == T_EOF) throw this.ex("parser.cc.2", this.offset);
+ if (type == T_CHAR && this.chardata == ']') {
+ tok.addRange(c, c);
+ tok.addRange('-', '-');
+ } else {
+ int rangeend = this.chardata;
+ if (type == T_BACKSOLIDUS)
+ rangeend = this.decodeEscaped();
+ this.next();
+ tok.addRange(c, rangeend);
+ }
+ }
+ }
+ if (this.isSet(RegularExpression.SPECIAL_COMMA)
+ && this.read() == T_CHAR && this.chardata == ',')
+ this.next();
+ }
+ if (this.read() == T_EOF)
+ throw this.ex("parser.cc.2", this.offset);
+ if (!useNrange && nrange) {
+ base.subtractRanges(tok);
+ tok = base;
+ }
+ tok.sortRanges();
+ tok.compactRanges();
+ //tok.dumpRanges();
+ /*
+ if (this.isSet(RegularExpression.IGNORE_CASE))
+ tok = RangeToken.createCaseInsensitiveToken(tok);
+ */
+ this.setContext(S_NORMAL);
+ this.next(); // Skips ']'
+
+ return tok;
+ }
+ private RangeToken parseCharacterClass_old(boolean useNrange) throws ParseException {
this.setContext(S_INBRACKETS);
this.next(); // '['
boolean nrange = false;
@@ -824,7 +963,7 @@
tok = Token.createNRange();
} else {
base = Token.createRange();
- base.addRange(0, 0xffff);
+ base.addRange(0, Token.UTF16_MAX);
tok = Token.createRange();
}
} else {
@@ -962,7 +1101,7 @@
/**
* '(?[' ... ']' (('-' | '+' | '&') '[' ... ']')? ')'
*/
- private RangeToken parseSetOperations() throws ParseException {
+ protected RangeToken parseSetOperations() throws ParseException {
RangeToken tok = this.parseCharacterClass(false);
int type;
while ((type = this.read()) != T_RPAREN) {
1.3 +157 -125 xml-xerces/java/src/org/apache/xerces/utils/regex/RegularExpression.java
Index: RegularExpression.java
===================================================================
RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/utils/regex/RegularExpression.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- RegularExpression.java 2000/05/17 18:32:55 1.2
+++ RegularExpression.java 2000/06/20 23:57:18 1.3
@@ -85,13 +85,6 @@
* }
* </pre>
*
- *<!--
- * <dt>C. Easy way
- * <pre>
- * if (RegularExpression.matches(<var>regex</var>, text) >= 0) { ... }
- * </pre>
- *-->
- *
* </dl>
*
* <h4>Case-insensitive matching</h4>
@@ -119,16 +112,16 @@
* 'Unicode Regular Expression Guidelines' Revision 4.
* When "w" and "u" are specified at the same time,
* <kbd>\b \B \< \></kbd> are processed for the "w" option.
+ * <dt><a name="COMMA_OPTION"><code>","</code></a>
+ * <dd>The parser treats a comma in a character class as a range separator.
+ * <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>,</kbd> or <kbd>b</kbd> without this option.
+ * <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>b</kbd> with this option.
*
* <dt><a name="X_OPTION"><code>"X"</code></a>
- * <dd class="REGEX"><!--<a href="http://www.w3.org/TR/xmlschema-2/#regexs">XML Schema</a> mode.-->
- * By this option, the engine confoms to <a href="http://www.w3.org/TR/1999/WD-xmlschema-2-19991217/#regexs">XML Schema: Regular Expression</a>.
+ * <dd class="REGEX">
+ * By this option, the engine conforms to <a href="http://www.w3.org/TR/2000/WD-xmlschema-2-20000407/#regexs">XML Schema: Regular Expression</a>.
* The <code>match()</code> method does not do subsring matching
* but entire string matching.
- * <dl>
- * <dt>NOT IMPLEMNTED FEATURES:
- * <dd>Character class subtraction
- * </dl>
*
* </dl>
*
@@ -139,15 +132,13 @@
* <td>
* <h4>Differences from the Perl 5 regular expression</h4>
* <ul>
- * <li><kbd>,</kbd> is a special character in <kbd>[]</kbd>.
* <li>There is 6-digit hexadecimal character representation (<kbd>\u005cv</kbd><var>HHHHHH</var>.)
- * <li><kbd>\X</kbd> has different meaning.
* <li>Supports subtraction, union, and intersection operations for character classes.
* <li>Not supported: <kbd>\</kbd><var>ooo</var> (Octal character representations),
* <Kbd>\G</kbd>, <kbd>\C</kbd>, <kbd>\l</kbd><var>c</var>,
* <kbd>\u005cu</kbd><var>c</var>, <kbd>\L</kbd>, <kbd>\U</kbd>,
- * <kbd>\E</kbd>, <kbd>\Q</kbd>,
- * <Kbd>(?{<kbd><var>code</var><kbd>})</kbd>, <Kbd>(?p{<kbd><var>code</var><kbd>})</kbd>
+ * <kbd>\E</kbd>, <kbd>\Q</kbd>, <kbd>\N{</kbd><var>name</var><kbd>}</kbd>,
+ * <Kbd>(?{<kbd><var>code</var><kbd>})</kbd>, <Kbd>(??{<kbd><var>code</var><kbd>})</kbd>
* </ul>
* </td>
* </tr>
@@ -197,16 +188,20 @@
* <dt class="REGEX"><kbd>\u005cv</kbd><var>HHHHHH</var>
* <dd>Matches a character of which code point is <var>HHHHHH</var> (Hexadecimal) in Unicode.
*
- * <dt class="REGEX"><kbd>\X</kbd>
+ * <dt class="REGEX"><kbd>\g</kbd>
* <dd>Matches a grapheme.
- * <dd class="REGEX">It is equivalent to <kbd>(?[\p{ASSIGNED}]-[\p{M},\p{C}])?(?:\p{M}|[\x{094D},\x{09CD},\x{0A4D},\x{0ACD},\x{0B3D},\x{0BCD},\x{0C4D},\x{0CCD},\x{0D4D},\x{0E3A},\x{0F84}]\p{L}|[\x{1160}-\x{11A7}]|[\x{11A8}-\x{11FF}]|[\x{FF9E},\x{FF9F}])*</kbd>
- * <dd class="WARNING"><Kbd>\X</kbd> in Perl 5.6 means <kbd>\P{M}\p{M}*</kbd>.</dd>
+ * <dd class="REGEX">It is equivalent to <kbd>(?[\p{ASSIGNED}]-[\p{M}\p{C}])?(?:\p{M}|[\x{094D}\x{09CD}\x{0A4D}\x{0ACD}\x{0B3D}\x{0BCD}\x{0C4D}\x{0CCD}\x{0D4D}\x{0E3A}\x{0F84}]\p{L}|[\x{1160}-\x{11A7}]|[\x{11A8}-\x{11FF}]|[\x{FF9E}\x{FF9F}])*</kbd>
+ *
+ * <dt class="REGEX"><kbd>\X</kbd>
+ * <dd class="REGEX">Matches a combining character sequence.
+ * It is equivalent to <kbd>(?:\PM\pM*)</kbd>
* </dl>
* </li>
*
* <li>Character class
* <dl>
- * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd>
+ * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without <a href="#COMMA_OPTION">"," option</a>)
+ * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with <a href="#COMMA_OPTION">"," option</a>)
* <dd>Positive character class. It matches a character in ranges.
* <dd><var>R<sub>n</sub></var>:
* <ul>
@@ -214,61 +209,65 @@
* <p>This range matches the character.
* <li class="REGEX"><var>C<sub>1</sub></var><kbd>-</kbd><var>C<sub>2</sub></var>
* <p>This range matches a character which has a code point that is >= <var>C<sub>1</sub></var>'s code point and <= <var>C<sub>2</sub></var>'s code point.
- * <li class="REGEX">A POSIX character class: <Kbd>[:alpha:] [:alnum:] [:ascii:] [:cntrl:] [:digit:] [:graph:] [:lower:] [:print:] [:punct:] [:space:] [:upper:] [:xdigit:]</kbd>
+ * <li class="REGEX">A POSIX character class: <Kbd>[:alpha:] [:alnum:] [:ascii:] [:cntrl:] [:digit:] [:graph:] [:lower:] [:print:] [:punct:] [:space:] [:upper:] [:xdigit:]</kbd>,
+ * and negative POSIX character classes in Perl like <kbd>[:^alpha:]</kbd>
* <p>...
* <li class="REGEX"><kbd>\d \D \s \S \w \W \p{</kbd><var>name</var><kbd>} \P{</kbd><var>name</var><kbd>}</kbd>
* <p>These expressions specifies the same ranges as the following expressions.
* </ul>
* <p class="REGEX">Enumerated ranges are merged (union operation).
- * <kbd>[a-e,c-z]</kbd> is equivalent to <kbd>[a-z]</kbd>
+ * <kbd>[a-ec-z]</kbd> is equivalent to <kbd>[a-z]</kbd>
*
- * <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd>
+ * <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without a <a href="#COMMA_OPTION">"," option</a>)
+ * <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with a <a href="#COMMA_OPTION">"," option</a>)
* <dd>Negative character class. It matches a character not in ranges.
*
* <dt class="REGEX"><kbd>(?[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd> ... <Kbd>)</kbd>
- * (<var>op</var> is <kbd>-</kbd> or <kbd>+</kbd> or <kbd>&</kbd>.
- * <var>ranges</var> is <var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var> or <kbd>^</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var>.)
- * <dd class="WARNING">This feature is highly experimental.</dd>
+ * (<var>op</var> is <kbd>-</kbd> or <kbd>+</kbd> or <kbd>&</kbd>.)
* <dd>Subtraction or union or intersection for character classes.
- * <dd class="REGEX">For exmaple, <kbd>(?[A-Z]-[C,F])</kbd> is equivalent to <kbd>[A-B,D-E,G-Z]</kbd>, and <kbd>(?[0x00-0x7f]-[K]&[\p{Lu}])</kbd> is equivalent to <kbd>[A-J,L-Z]</kbd>.
+ * <dd class="REGEX">For example, <kbd>(?[A-Z]-[CF])</kbd> is equivalent to <kbd>[A-BD-EG-Z]</kbd>, and <kbd>(?[0x00-0x7f]-[K]&[\p{Lu}])</kbd> is equivalent to <kbd>[A-JL-Z]</kbd>.
* <dd>The result of this operations is a <u>positive character class</u>
* even if an expression includes any negative character classes.
* You have to take care on this in case-insensitive matching.
- * For instance, <kbd>(?[^b])</kbd> is equivalent to <kbd>[\x00-a,c-\x{10ffff}]</kbd>,
+ * For instance, <kbd>(?[^b])</kbd> is equivalent to <kbd>[\x00-ac-\x{10ffff}]</kbd>,
* which is equivalent to <kbd>[^b]</kbd> in case-sensitive matching.
* But, in case-insensitive matching, <kbd>(?[^b])</kbd> matches any character because
* it includes '<kbd>B</kbd>' and '<kbd>B</kbd>' matches '<kbd>b</kbd>'
- * though <kbd>[^b]</kbd> is processed as <kbd>[^B,b]</kbd>.
+ * though <kbd>[^b]</kbd> is processed as <kbd>[^Bb]</kbd>.
*
+ * <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub>R<sub>2</sub>...</var><kbd>-[</kbd><var>R<sub>n</sub>R<sub>n+1</sub>...</var><kbd>]]</kbd> (with an <a href="#X_OPTION">"X" option</a>)</dt>
+ * <dd>Character class subtraction for the XML Schema.
+ * You can use this syntax when you specify an <a href="#X_OPTION">"X" option</a>.
+ *
* <dt class="REGEX"><kbd>\d</kbd>
* <dd class="REGEX">Equivalent to <kbd>[0-9]</kbd>.
- * <dd>When <a href="#U_OPTION">the "u" option</a> is set, it is equivalent to
+ * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
* <span class="REGEX"><kbd>\p{Nd}</kbd></span>.
*
* <dt class="REGEX"><kbd>\D</kbd>
* <dd class="REGEX">Equivalent to <kbd>[^0-9]</kbd>
- * <dd>When <a href="#U_OPTION">the "u" option</a> is set, it is equivalent to
+ * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
* <span class="REGEX"><kbd>\P{Nd}</kbd></span>.
*
* <dt class="REGEX"><kbd>\s</kbd>
- * <dd class="REGEX">Equivalent to <kbd>[ ,\f,\n,\r,\t]</kbd>
- * <dd>When <a href="#U_OPTION">the "u" option</a> is set, it is equivalent to
- * <span class="REGEX"><kbd>[ ,\f,\n,\r,\t,\p{Z}]</kbd></span>.
+ * <dd class="REGEX">Equivalent to <kbd>[ \f\n\r\t]</kbd>
+ * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
+ * <span class="REGEX"><kbd>[ \f\n\r\t\p{Z}]</kbd></span>.
*
* <dt class="REGEX"><kbd>\S</kbd>
- * <dd class="REGEX">Equivalent to <kbd>[^ ,\f,\n,\r,\t]</kbd>
- * <dd>When <a href="#U_OPTION">the "u" option</a> is set, it is equivalent to
- * <span class="REGEX"><kbd>[^ ,\f,\n,\r,\t,\p{Z}]</kbd></span>.
+ * <dd class="REGEX">Equivalent to <kbd>[^ \f\n\r\t]</kbd>
+ * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
+ * <span class="REGEX"><kbd>[^ \f\n\r\t\p{Z}]</kbd></span>.
*
* <dt class="REGEX"><kbd>\w</kbd>
- * <dd class="REGEX">Equivalent to <kbd>[a-z,A-Z,0-9,_]</kbd>
- * <dd>When <a href="#U_OPTION">the "u" option</a> is set, it is equivalent to
- * <span class="REGEX"><kbd>[\p{Lu},\p{Ll},\p{Lo},\p{Nd},_]</kbd></span>.
+ * <dd class="REGEX">Equivalent to <kbd>[a-zA-Z0-9_]</kbd>
+ * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
+ * <span class="REGEX"><kbd>[\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
*
* <dt class="REGEX"><kbd>\W</kbd>
- * <dd class="REGEX">Equivalent to <kbd>[^a-z,A-Z,0-9,_]</kbd>
- * <dd>When <a href="#U_OPTION">the "u" option</a> is set, it is equivalent to
- * <span class="REGEX"><kbd>[^\p{Lu},\p{Ll},\p{Lo},\p{Nd},_]</kbd></span>.
+ * <dd class="REGEX">Equivalent to <kbd>[^a-zA-Z0-9_]</kbd>
+ * <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
+ * <span class="REGEX"><kbd>[^\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
*
* <dt class="REGEX"><kbd>\p{</kbd><var>name</var><kbd>}</kbd>
* <dd>Matches one character in the specified General Category (the second field in <a href="ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt"><kbd>UnicodeData.txt</kbd></a>) or the specified <a href="ftp://ftp.unicode.org/Public/UNIDATA/Blocks.txt">Block</a>.
@@ -403,7 +402,7 @@
*
* <dt class="REGEX"><kbd>^</kbd>
* <dd>Matches the beginning of the text. It is equivalent to <span class="REGEX"><Kbd>\A</kbd></span>.
- * <dd>When <a href="#M_OPTION">the "m" option</a> is set,
+ * <dd>When <a href="#M_OPTION">a "m" option</a> is set,
* it matches the beginning of the text, or after one of EOL characters (
* LINE FEED (U+000A), CARRIAGE RETURN (U+000D), LINE SEPARATOR (U+2028),
* PARAGRAPH SEPARATOR (U+2029).)
@@ -411,24 +410,24 @@
* <dt class="REGEX"><kbd>$</kbd>
* <dd>Matches the end of the text, or before an EOL character at the end of the text,
* or CARRIAGE RETURN + LINE FEED at the end of the text.
- * <dd>When <a href="#M_OPTION">the "m" option</a> is set,
+ * <dd>When <a href="#M_OPTION">a "m" option</a> is set,
* it matches the end of the text, or before an EOL character.
*
* <dt class="REGEX"><kbd>\b</kbd>
* <dd>Matches word boundary.
- * (See <a href="#W_OPTION">the "w" option</a>)
+ * (See <a href="#W_OPTION">a "w" option</a>)
*
* <dt class="REGEX"><kbd>\B</kbd>
* <dd>Matches non word boundary.
- * (See <a href="#W_OPTION">the "w" option</a>)
+ * (See <a href="#W_OPTION">a "w" option</a>)
*
* <dt class="REGEX"><kbd>\<</kbd>
* <dd>Matches the beginning of a word.
- * (See <a href="#W_OPTION">the "w" option</a>)
+ * (See <a href="#W_OPTION">a "w" option</a>)
*
* <dt class="REGEX"><kbd>\></kbd>
* <dd>Matches the end of a word.
- * (See <a href="#W_OPTION">the "w" option</a>)
+ * (See <a href="#W_OPTION">a "w" option</a>)
* </dl>
* </li>
* <li>Lookahead and lookbehind
@@ -493,7 +492,7 @@
*
* char-class ::= '[' ranges ']'
* | '(?[' ranges ']' ([-+&] '[' ranges ']')? ')'
- * ranges ::= '^'? (range ','?)+
+ * ranges ::= '^'? (range <a href="#COMMA_OPTION">','?</a>)+
* range ::= '\d' | '\w' | '\s' | '\D' | '\W' | '\S' | category-block
* | range-char | range-char '-' range-char
* range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | code-point | character-2
@@ -523,6 +522,9 @@
public class RegularExpression implements java.io.Serializable {
static final boolean DEBUG = false;
+ /**
+ * Compiles a token tree into an operation flow.
+ */
private synchronized void compile(Token tok) {
if (this.operations != null)
return;
@@ -530,6 +532,9 @@
this.operations = this.compile(tok, null, false);
}
+ /**
+ * Converts a token to an operation.
+ */
private Op compile(Token tok, Op next, boolean reverse) {
Op ret;
switch (tok.type) {
@@ -688,39 +693,47 @@
//Public
-/**
- *
- * @return true if the target is matched to this regular expression.
- */
+ /**
+ * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
+ *
+ * @return true if the target is matched to this regular expression.
+ */
public boolean matches(char[] target) {
return this.matches(target, 0, target .length , (Match)null);
}
-/**
- *
- * @return true if the target is matched to this regular expression.
- */
+ /**
+ * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
+ * in specified range or not.
+ *
+ * @param start Start offset of the range.
+ * @param end End offset +1 of the range.
+ * @return true if the target is matched to this regular expression.
+ */
public boolean matches(char[] target, int start, int end) {
return this.matches(target, start, end, (Match)null);
}
-/**
- *
- * @param match A Match instance for storing matching result.
- * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
- */
+ /**
+ * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
+ *
+ * @param match A Match instance for storing matching result.
+ * @return true if the target is matched to this regular expression.
+ */
public boolean matches(char[] target, Match match) {
return this.matches(target, 0, target .length , match);
}
-/**
- *
- * @param match A Match instance for storing matching result.
- * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
- */
-
-
+ /**
+ * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
+ * in specified range or not.
+ *
+ * @param start Start offset of the range.
+ * @param end End offset +1 of the range.
+ * @param match A Match instance for storing matching result.
+ * @return true if the target is matched to this regular expression.
+ */
public boolean matches(char[] target, int start, int end, Match match) {
synchronized (this) {
@@ -799,7 +812,8 @@
/*
* Checks whether the expression starts with ".*".
*/
- if (this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
+ if (this.operations != null
+ && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
if (isSet(this.options, SINGLE_LINE)) {
matchStart = con.start;
matchEnd = this. matchCharArray (con, this.operations, con.start, 1, this.options);
@@ -1387,42 +1401,46 @@
-/**
- *
- * @return true if the target is matched to this regular expression.
- */
+ /**
+ * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
+ *
+ * @return true if the target is matched to this regular expression.
+ */
public boolean matches(String target) {
return this.matches(target, 0, target .length() , (Match)null);
}
-/**
- *
- * @return true if the target is matched to this regular expression.
- */
+ /**
+ * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
+ * in specified range or not.
+ *
+ * @param start Start offset of the range.
+ * @param end End offset +1 of the range.
+ * @return true if the target is matched to this regular expression.
+ */
public boolean matches(String target, int start, int end) {
return this.matches(target, start, end, (Match)null);
}
-/**
- *
- * @param match A Match instance for storing matching result.
- * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
- */
+ /**
+ * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
+ *
+ * @param match A Match instance for storing matching result.
+ * @return true if the target is matched to this regular expression.
+ */
public boolean matches(String target, Match match) {
return this.matches(target, 0, target .length() , match);
}
-
-
-/**
- *
- * @param match A Match instance for storing matching result.
- * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
- */
-
-
-
-
+ /**
+ * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
+ * in specified range or not.
+ *
+ * @param start Start offset of the range.
+ * @param end End offset +1 of the range.
+ * @param match A Match instance for storing matching result.
+ * @return true if the target is matched to this regular expression.
+ */
public boolean matches(String target, int start, int end, Match match) {
synchronized (this) {
@@ -1501,7 +1519,8 @@
/*
* Checks whether the expression starts with ".*".
*/
- if (this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
+ if (this.operations != null
+ && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
if (isSet(this.options, SINGLE_LINE)) {
matchStart = con.start;
matchEnd = this. matchString (con, this.operations, con.start, 1, this.options);
@@ -1584,9 +1603,9 @@
}
}
-/**
- * @return -1 when not match; offset of the end of matched string when match.
- */
+ /**
+ * @return -1 when not match; offset of the end of matched string when match.
+ */
private int matchString (Context con, Op op, int offset, int dx, int opts) {
@@ -2048,21 +2067,22 @@
-/**
- *
- * @return true if the target is matched to this regular expression.
- */
+ /**
+ * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
+ *
+ * @return true if the target is matched to this regular expression.
+ */
public boolean matches(CharacterIterator target) {
return this.matches(target, (Match)null);
}
-
-/**
- *
- * @param match A Match instance for storing matching result.
- * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
- */
+ /**
+ * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
+ *
+ * @param match A Match instance for storing matching result.
+ * @return true if the target is matched to this regular expression.
+ */
public boolean matches(CharacterIterator target, Match match) {
int start = target.getBeginIndex();
int end = target.getEndIndex();
@@ -2145,7 +2165,8 @@
/*
* Checks whether the expression starts with ".*".
*/
- if (this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
+ if (this.operations != null
+ && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
if (isSet(this.options, SINGLE_LINE)) {
matchStart = con.start;
matchEnd = this. matchCharacterIterator (con, this.operations, con.start, 1, this.options);
@@ -2228,9 +2249,9 @@
}
}
-/**
- * @return -1 when not match; offset of the end of matched string when match.
- */
+ /**
+ * @return -1 when not match; offset of the end of matched string when match.
+ */
private int matchCharacterIterator (Context con, Op op, int offset, int dx, int opts) {
@@ -2690,9 +2711,9 @@
return true;
}
-/**
- * @see java.lang.String#regionMatches
- */
+ /**
+ * @see java.lang.String#regionMatches
+ */
private static final boolean regionMatchesIgnoreCase(CharacterIterator target, int offset, int limit,
String part, int partlen) {
if (offset < 0) return false;
@@ -2817,6 +2838,9 @@
}
}
+ /**
+ * Prepares for matching. This method is called just before starting matching.
+ */
void prepare() {
if (Op.COUNT) Op.nofinstances = 0;
this.compile(this.tokentree);
@@ -2844,7 +2868,8 @@
}
}
- if ((this.operations.type == Op.STRING || this.operations.type == Op.CHAR)
+ if (this.operations != null
+ && (this.operations.type == Op.STRING || this.operations.type == Op.CHAR)
&& this.operations.next == null) {
if (DEBUG)
System.err.print(" *** Only fixed string! *** ");
@@ -2950,14 +2975,18 @@
* "X". XML Schema mode.
*/
static final int XMLSCHEMA_MODE = 1<<9;
+ /**
+ * ",".
+ */
+ static final int SPECIAL_COMMA = 1<<10;
private static final boolean isSet(int options, int flag) {
- return(options & flag) == flag;
+ return (options & flag) == flag;
}
/**
- * Constructor.
+ * Creates a new RegularExpression instance.
*
* @param regex A regular expression
* @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax.
@@ -2967,10 +2996,10 @@
}
/**
- * Constructor.
+ * Creates a new RegularExpression instance with options.
*
* @param regex A regular expression
- * @param options A String consisted of "i" "m" "s" "u" "w"
+ * @param options A String consisting of "i" "m" "s" "u" "w" "," "X"
* @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax.
*/
public RegularExpression(String regex, String options) throws ParseException {
@@ -3022,10 +3051,13 @@
* Represents this instance in String.
*/
public String toString() {
- return this.tokentree.toString();
+ return this.tokentree.toString(this.options);
}
/**
+ * Returns an option string.
+ * The order of letters in it may be different from a string specified
+ * in a constructor or <code>setPattern()</code>.
*
* @see #RegularExpression(java.lang.String,java.lang.String)
* @see #setPattern(java.lang.String,java.lang.String)
@@ -3035,7 +3067,7 @@
}
/**
- *
+ * Return true if patterns are the same and the options are equivalent.
*/
public boolean equals(Object obj) {
if (obj == null) return false;
@@ -3053,7 +3085,7 @@
*
*/
public int hashCode() {
- return(this.regex+"/"+this.getOptions()).hashCode();
+ return (this.regex+"/"+this.getOptions()).hashCode();
}
/**
1.3 +5 -0 xml-xerces/java/src/org/apache/xerces/utils/regex/REUtil.java
Index: REUtil.java
===================================================================
RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/utils/regex/REUtil.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- REUtil.java 2000/05/17 18:32:54 1.2
+++ REUtil.java 2000/06/20 23:57:18 1.3
@@ -123,6 +123,9 @@
case 'X':
ret = RegularExpression.XMLSCHEMA_MODE;
break;
+ case ',':
+ ret = RegularExpression.SPECIAL_COMMA;
+ break;
default:
}
return ret;
@@ -160,6 +163,8 @@
sb.append((char)'w');
if ((options & RegularExpression.EXTENDED_COMMENT) != 0)
sb.append((char)'x');
+ if ((options & RegularExpression.SPECIAL_COMMA) != 0)
+ sb.append((char)',');
return sb.toString().intern();
}
1.3 +58 -40 xml-xerces/java/src/org/apache/xerces/utils/regex/Token.java
Index: Token.java
===================================================================
RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/utils/regex/Token.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- Token.java 2000/05/17 18:32:57 1.2
+++ Token.java 2000/06/20 23:57:18 1.3
@@ -276,6 +276,9 @@
}
public String toString() {
+ return this.toString(0);
+ }
+ public String toString(int options) {
return this.type == Token.DOT ? "." : "";
}
@@ -811,10 +814,11 @@
Token.categories.put(n, r1);
Token.categories2.put(n, Token.complementRanges(r1));
if (n.indexOf(' ') >= 0) {
- StringBuffer buffer = new StringBuffer(n.length());
+ StringBuffer buffer = new StringBuffer(n.length()+2);
+ buffer.append("Is");
for (int ci = 0; ci < n.length(); ci ++)
if (n.charAt(ci) != ' ') buffer.append((char)n.charAt(ci));
- Token.setAlias(buffer.toString(), n, true);
+ Token.setAlias(new String(buffer), n, true);
}
}
@@ -961,6 +965,20 @@
return Token.token_grapheme;
}
+ /**
+ * Combining Character Sequence in Perl 5.6.
+ */
+ static private Token token_ccs = null;
+ static synchronized protected Token getCombiningCharacterSequence() {
+ if (Token.token_ccs != null)
+ return Token.token_ccs;
+
+ Token foo = Token.createClosure(Token.getRange("M", true)); // \pM*
+ foo = Token.createConcat(Token.getRange("M", false), foo); // \PM + \pM*
+ Token.token_ccs = foo;
+ return Token.token_ccs;
+ }
+
// ------------------------------------------------------
// ------------------------------------------------------
@@ -984,7 +1002,7 @@
return this.string;
}
- public String toString() {
+ public String toString(int options) {
if (this.type == BACKREFERENCE)
return "\\"+this.refNumber;
else
@@ -1012,14 +1030,14 @@
return index == 0 ? this.child : this.child2;
}
- public String toString() {
+ public String toString(int options) {
String ret;
if (this.child2.type == CLOSURE && this.child2.getChild(0) == this.child) {
- ret = this.child.toString()+"+";
+ ret = this.child.toString(options)+"+";
} else if (this.child2.type == NONGREEDYCLOSURE && this.child2.getChild(0) == this.child) {
- ret = this.child.toString()+"+?";
+ ret = this.child.toString(options)+"+?";
} else
- ret = this.child.toString()+this.child2.toString();
+ ret = this.child.toString(options)+this.child2.toString(options);
return ret;
}
}
@@ -1039,7 +1057,7 @@
return this.chardata;
}
- public String toString() {
+ public String toString(int options) {
String ret;
switch (this.type) {
case CHAR:
@@ -1120,29 +1138,29 @@
return this.max;
}
- public String toString() {
+ public String toString(int options) {
String ret;
if (this.type == CLOSURE) {
if (this.getMin() < 0 && this.getMax() < 0) {
- ret = this.child.toString()+"*";
+ ret = this.child.toString(options)+"*";
} else if (this.getMin() == this.getMax()) {
- ret = this.child.toString()+"{"+this.getMin()+"}";
+ ret = this.child.toString(options)+"{"+this.getMin()+"}";
} else if (this.getMin() >= 0 && this.getMax() >= 0) {
- ret = this.child.toString()+"{"+this.getMin()+","+this.getMax()+"}";
+ ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}";
} else if (this.getMin() >= 0 && this.getMax() < 0) {
- ret = this.child.toString()+"{"+this.getMin()+",}";
+ ret = this.child.toString(options)+"{"+this.getMin()+",}";
} else
throw new RuntimeException("Token#toString(): CLOSURE "
+this.getMin()+", "+this.getMax());
} else {
if (this.getMin() < 0 && this.getMax() < 0) {
- ret = this.child.toString()+"*?";
+ ret = this.child.toString(options)+"*?";
} else if (this.getMin() == this.getMax()) {
- ret = this.child.toString()+"{"+this.getMin()+"}?";
+ ret = this.child.toString(options)+"{"+this.getMin()+"}?";
} else if (this.getMin() >= 0 && this.getMax() >= 0) {
- ret = this.child.toString()+"{"+this.getMin()+","+this.getMax()+"}?";
+ ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}?";
} else if (this.getMin() >= 0 && this.getMax() < 0) {
- ret = this.child.toString()+"{"+this.getMin()+",}?";
+ ret = this.child.toString(options)+"{"+this.getMin()+",}?";
} else
throw new RuntimeException("Token#toString(): NONGREEDYCLOSURE "
+this.getMin()+", "+this.getMax());
@@ -1175,31 +1193,31 @@
return this.parennumber;
}
- public String toString() {
+ public String toString(int options) {
String ret = null;
switch (this.type) {
case PAREN:
if (this.parennumber == 0) {
- ret = "(?:"+this.child.toString()+")";
+ ret = "(?:"+this.child.toString(options)+")";
} else {
- ret = "("+this.child.toString()+")";
+ ret = "("+this.child.toString(options)+")";
}
break;
case LOOKAHEAD:
- ret = "(?="+this.child.toString()+")";
+ ret = "(?="+this.child.toString(options)+")";
break;
case NEGATIVELOOKAHEAD:
- ret = "(?!"+this.child.toString()+")";
+ ret = "(?!"+this.child.toString(options)+")";
break;
case LOOKBEHIND:
- ret = "(?<="+this.child.toString()+")";
+ ret = "(?<="+this.child.toString(options)+")";
break;
case NEGATIVELOOKBEHIND:
- ret = "(?<!"+this.child.toString()+")";
+ ret = "(?<!"+this.child.toString(options)+")";
break;
case INDEPENDENT:
- ret = "(?>"+this.child.toString()+")";
+ ret = "(?>"+this.child.toString(options)+")";
break;
}
return ret;
@@ -1230,7 +1248,7 @@
throw new RuntimeException("Internal Error: "+index);
}
- public String toString() {
+ public String toString(int options) {
String ret;
if (refNumber > 0) {
ret = "(?("+refNumber+")";
@@ -1278,12 +1296,12 @@
return this.mask;
}
- public String toString() {
+ public String toString(int options) {
return "(?"
+(this.add == 0 ? "" : REUtil.createOptionString(this.add))
+(this.mask == 0 ? "" : REUtil.createOptionString(this.mask))
+":"
- +this.child.toString()
+ +this.child.toString(options)
+")";
}
}
@@ -1352,7 +1370,7 @@
buffer.append(tok.getString());
}
- ((StringToken)previous).string = buffer.toString();
+ ((StringToken)previous).string = new String(buffer);
}
int size() {
@@ -1362,40 +1380,40 @@
return (Token)this.children.elementAt(index);
}
- public String toString() {
+ public String toString(int options) {
String ret;
if (this.type == CONCAT) {
if (this.children.size() == 2) {
Token ch = this.getChild(0);
Token ch2 = this.getChild(1);
if (ch2.type == CLOSURE && ch2.getChild(0) == ch) {
- ret = ch.toString()+"+";
+ ret = ch.toString(options)+"+";
} else if (ch2.type == NONGREEDYCLOSURE && ch2.getChild(0) == ch) {
- ret = ch.toString()+"+?";
+ ret = ch.toString(options)+"+?";
} else
- ret = ch.toString()+ch2.toString();
+ ret = ch.toString(options)+ch2.toString(options);
} else {
StringBuffer sb = new StringBuffer();
for (int i = 0; i < this.children.size(); i ++) {
- sb.append(this.children.elementAt(i).toString());
+ sb.append(((Token)this.children.elementAt(i)).toString(options));
}
- ret = sb.toString();
+ ret = new String(sb);
}
return ret;
}
if (this.children.size() == 2 && this.getChild(1).type == EMPTY) {
- ret = this.getChild(0).toString()+"?";
+ ret = this.getChild(0).toString(options)+"?";
} else if (this.children.size() == 2
&& this.getChild(0).type == EMPTY) {
- ret = this.getChild(1).toString()+"??";
+ ret = this.getChild(1).toString(options)+"??";
} else {
StringBuffer sb = new StringBuffer();
- sb.append(this.children.elementAt(0).toString());
+ sb.append(((Token)this.children.elementAt(0)).toString(options));
for (int i = 1; i < this.children.size(); i ++) {
sb.append((char)'|');
- sb.append(this.children.elementAt(i).toString());
+ sb.append(((Token)this.children.elementAt(i)).toString(options));
}
- ret = sb.toString();
+ ret = new String(sb);
}
return ret;
}