You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@xerces.apache.org by je...@locus.apache.org on 2000/06/21 01:57:26 UTC

cvs commit: xml-xerces/java/src/org/apache/xerces/utils/regex BMPattern.java Match.java ParserForXMLSchema.java RangeToken.java RegexParser.java RegularExpression.java REUtil.java Token.java

jeffreyr    00/06/20 16:57:26

  Modified:    java/src/org/apache/xerces/utils/regex BMPattern.java
                        Match.java ParserForXMLSchema.java RangeToken.java
                        RegexParser.java RegularExpression.java REUtil.java
                        Token.java
  Log:
  Enables character class subtraction such as [a-z-[c]] and fixes some bugs. Fix contributed by Kento Tamura.
  
  Revision  Changes    Path
  1.2       +3 -3      xml-xerces/java/src/org/apache/xerces/utils/regex/BMPattern.java
  
  Index: BMPattern.java
  ===================================================================
  RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/utils/regex/BMPattern.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- BMPattern.java	2000/04/04 19:31:31	1.1
  +++ BMPattern.java	2000/06/20 23:57:18	1.2
  @@ -101,7 +101,7 @@
   
       /**
        *
  -     * @return -1 if iterator does not contain this pattern.
  +     * @return -1 if <var>iterator</var> does not contain this pattern.
        */
       public int matches(CharacterIterator iterator, int start, int limit) {
           if (this.ignoreCase)  return this.matchesIgnoreCase(iterator, start, limit);
  @@ -126,7 +126,7 @@
   
       /**
        *
  -     * @return -1 if str does not contain this pattern.
  +     * @return -1 if <var>str</var> does not contain this pattern.
        */
       public int matches(String str, int start, int limit) {
           if (this.ignoreCase)  return this.matchesIgnoreCase(str, start, limit);
  @@ -151,7 +151,7 @@
       }
       /**
        *
  -     * @return -1 if str does not contain this pattern.
  +     * @return -1 if <var>chars</var> does not contain this pattern.
        */
       public int matches(char[] chars, int start, int limit) {
           if (this.ignoreCase)  return this.matchesIgnoreCase(chars, start, limit);
  
  
  
  1.2       +36 -4     xml-xerces/java/src/org/apache/xerces/utils/regex/Match.java
  
  Index: Match.java
  ===================================================================
  RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/utils/regex/Match.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- Match.java	2000/04/04 19:31:31	1.1
  +++ Match.java	2000/06/20 23:57:18	1.2
  @@ -62,6 +62,13 @@
   
   /**
    * 
  + * An instance of this class holds the ranges captured during matching.
  + *
  + * @see org.apache.xerces.utils.regex.RegularExpression#matches(char[], int, int, org.apache.xerces.utils.regex.Match)
  + * @see org.apache.xerces.utils.regex.RegularExpression#matches(char[], org.apache.xerces.utils.regex.Match)
  + * @see org.apache.xerces.utils.regex.RegularExpression#matches(java.text.CharacterIterator, org.apache.xerces.utils.regex.Match)
  + * @see org.apache.xerces.utils.regex.RegularExpression#matches(java.lang.String, int, int, org.apache.xerces.utils.regex.Match)
  + * @see org.apache.xerces.utils.regex.RegularExpression#matches(java.lang.String, org.apache.xerces.utils.regex.Match)
    * @author TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;
    */
   public class Match implements Cloneable {
  @@ -73,9 +80,15 @@
       String strSource = null;
       char[] charSource = null;
   
  +    /**
  +     * Creates an instance.
  +     */
       public Match() {
       }
   
  +    /**
  +     *
  +     */
       public synchronized Object clone() {
           Match ma = new Match();
           if (this.nofgroups > 0) {
  @@ -90,6 +103,9 @@
           return ma;
       }
   
  +    /**
  +     *
  +     */
       protected void setNumberOfGroups(int n) {
           int oldn = this.nofgroups;
           this.nofgroups = n;
  @@ -104,26 +120,41 @@
           }
       }
   
  +    /**
  +     *
  +     */
       protected void setSource(CharacterIterator ci) {
           this.ciSource = ci;
           this.strSource = null;
           this.charSource = null;
       }
  +    /**
  +     *
  +     */
       protected void setSource(String str) {
           this.ciSource = null;
           this.strSource = str;
           this.charSource = null;
       }
  +    /**
  +     *
  +     */
       protected void setSource(char[] chars) {
           this.ciSource = null;
           this.strSource = null;
           this.charSource = chars;
       }
   
  +    /**
  +     *
  +     */
       protected void setBeginning(int index, int v) {
           this.beginpos[index] = v;
       }
   
  +    /**
  +     *
  +     */
       protected void setEnd(int index, int v) {
           this.endpos[index] = v;
       }
  @@ -178,13 +209,14 @@
               throw new IllegalArgumentException("The parameter must be less than "
                                                  +this.nofgroups+": "+index);
           String ret;
  +        int begin = this.beginpos[index], end = this.endpos[index];
  +        if (begin < 0 || end < 0)  return null;
           if (this.ciSource != null) {
  -            ret = REUtil.substring(this.ciSource, this.beginpos[index], this.endpos[index]);
  +            ret = REUtil.substring(this.ciSource, begin, end);
           } else if (this.strSource != null) {
  -            ret = this.strSource.substring(this.beginpos[index], this.endpos[index]);
  +            ret = this.strSource.substring(begin, end);
           } else {
  -            int begin = this.beginpos[index];
  -            ret = new String(this.charSource, begin, this.endpos[index]-begin);
  +            ret = new String(this.charSource, begin, end-begin);
           }
           return ret;
       }
  
  
  
  1.3       +143 -8    xml-xerces/java/src/org/apache/xerces/utils/regex/ParserForXMLSchema.java
  
  Index: ParserForXMLSchema.java
  ===================================================================
  RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/utils/regex/ParserForXMLSchema.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- ParserForXMLSchema.java	2000/06/02 23:08:32	1.2
  +++ ParserForXMLSchema.java	2000/06/20 23:57:18	1.3
  @@ -62,14 +62,10 @@
   import java.util.Hashtable;
   import java.util.Locale;
   
  -/*
  - * TODO:
  - * Grammar of character classes
  - * Shorthands in character classes
  - */
  -
   /**
  + * A regular expression parser for XML Schema.
    *
  + * @author TAMURA Kent &lt;kent@trl.ibm.co.jp&gt;
    */
   class ParserForXMLSchema extends RegexParser {
   
  @@ -176,11 +172,14 @@
           this.next();
           return this.getTokenForShorthand('I');
       }
  +    Token processBacksolidus_g() throws ParseException {
  +        throw this.ex("parser.process.1", this.offset-2);
  +    }
       Token processBacksolidus_X() throws ParseException {
  -        throw ex("parser.process.1", this.offset);
  +        throw ex("parser.process.1", this.offset-2);
       }
       Token processBackreference() throws ParseException {
  -        throw ex("parser.process.1", this.offset);
  +        throw ex("parser.process.1", this.offset-4);
       }
   
       int processCIinCharacterClass(RangeToken tok, int c) {
  @@ -189,6 +188,142 @@
       }
   
   
  +    /**
  +     * Parses a character-class-expression, not a character-class-escape.
  +     *
  +     * c-c-expression   ::= '[' c-group ']'
  +     * c-group          ::= positive-c-group | negative-c-group | c-c-subtraction
  +     * positive-c-group ::= (c-range | c-c-escape)+
  +     * negative-c-group ::= '^' positive-c-group
  +     * c-c-subtraction  ::= (positive-c-group | negative-c-group) subtraction
  +     * subtraction      ::= '-' c-c-expression
  +     * c-range          ::= single-range | from-to-range
  +     * single-range     ::= multi-c-escape | category-c-escape | block-c-escape | <any XML char>
  +     * cc-normal-c      ::= <any character except [, ], \>
  +     * from-to-range    ::= cc-normal-c '-' cc-normal-c
  +     *
  +     * @param useNrange Ignored.
  +     * @return A range token; this method never returns an NRANGE token.
  +     */
  +    protected RangeToken parseCharacterClass(boolean useNrange) throws ParseException {
  +        this.setContext(S_INBRACKETS);
  +        this.next();                            // '['
  +        boolean nrange = false;
  +        RangeToken base = null;
  +        RangeToken tok;
  +        if (this.read() == T_CHAR && this.chardata == '^') {
  +            nrange = true;
  +            this.next();                        // '^'
  +            base = Token.createRange();
  +            base.addRange(0, Token.UTF16_MAX);
  +            tok = Token.createRange();
  +        } else {
  +            tok = Token.createRange();
  +        }
  +        int type;
  +        boolean firstloop = true;
  +        while ((type = this.read()) != T_EOF) { // Don't use 'continue' for this loop.
  +            // single-range | from-to-range | subtraction
  +            if (type == T_CHAR && this.chardata == ']' && !firstloop) {
  +                if (nrange) {
  +                    base.subtractRanges(tok);
  +                    tok = base;
  +                }
  +                break;
  +            }
  +            int c = this.chardata;
  +            boolean end = false;
  +            if (type == T_BACKSOLIDUS) {
  +                switch (c) {
  +                  case 'd':  case 'D':
  +                  case 'w':  case 'W':
  +                  case 's':  case 'S':
  +                    tok.mergeRanges(this.getTokenForShorthand(c));
  +                    end = true;
  +                    break;
  +
  +                  case 'i':  case 'I':
  +                  case 'c':  case 'C':
  +                    c = this.processCIinCharacterClass(tok, c);
  +                    if (c < 0)  end = true;
  +                    break;
  +                    
  +                  case 'p':
  +                  case 'P':
  +                    int pstart = this.offset;
  +                    RangeToken tok2 = this.processBacksolidus_pP(c);
  +                    if (tok2 == null)  throw this.ex("parser.atom.5", pstart);
  +                    tok.mergeRanges(tok2);
  +                    end = true;
  +                    break;
  +
  +                  default:
  +                    c = this.decodeEscaped();
  +                } // \ + c
  +            } // backsolidus
  +            else if (type == T_XMLSCHEMA_CC_SUBTRACTION && !firstloop) {
  +                                                // Subtraction
  +                if (nrange) {
  +                    base.subtractRanges(tok);
  +                    tok = base;
  +                }
  +                RangeToken range2 = this.parseCharacterClass(false);
  +                tok.subtractRanges(range2);
  +                if (this.read() != T_CHAR || this.chardata != ']')
  +                    throw this.ex("parser.cc.5", this.offset);
  +                break;                          // Exit this loop
  +            }
  +            this.next();
  +            if (!end) {                         // if not shorthands...
  +                if (type == T_CHAR) {
  +                    if (c == '[')  throw this.ex("parser.cc.6", this.offset-2);
  +                    if (c == ']')  throw this.ex("parser.cc.7", this.offset-2);
  +                }
  +                if (this.read() != T_CHAR || this.chardata != '-') { // Here is no '-'.
  +                    tok.addRange(c, c);
  +                } else {                        // Found '-'
  +                                                // Is this '-' a from-to token?
  +                    this.next(); // Skips '-'
  +                    if ((type = this.read()) == T_EOF)  throw this.ex("parser.cc.2", this.offset);
  +                                                // c '-' ']' -> '-' is a single-range.
  +                    if (type == T_CHAR && this.chardata == ']') {
  +                        tok.addRange(c, c);
  +                        tok.addRange('-', '-');
  +                    }
  +                                                // c '-' '-[' -> '-' is a single-range.
  +                    else if (type == T_XMLSCHEMA_CC_SUBTRACTION) {
  +                        tok.addRange(c, c);
  +                        tok.addRange('-', '-');
  +                    } else {
  +                        int rangeend = this.chardata;
  +                        if (type == T_CHAR) {
  +                            if (rangeend == '[')  throw this.ex("parser.cc.6", this.offset-1);
  +                            if (rangeend == ']')  throw this.ex("parser.cc.7", this.offset-1);
  +                        }
  +                        if (type == T_BACKSOLIDUS)
  +                            rangeend = this.decodeEscaped();
  +                        this.next();
  +                        tok.addRange(c, rangeend);
  +                    }
  +                }
  +            }
  +            firstloop = false;
  +        }
  +        if (this.read() == T_EOF)
  +            throw this.ex("parser.cc.2", this.offset);
  +        tok.sortRanges();
  +        tok.compactRanges();
  +        //tok.dumpRanges();
  +        this.setContext(S_NORMAL);
  +        this.next();                    // Skips ']'
  +
  +        return tok;
  +    }
  +
  +    protected RangeToken parseSetOperations() throws ParseException {
  +        throw this.ex("parser.process.1", this.offset);
  +    }
  + 
       Token getTokenForShorthand(int ch) {
           switch (ch) {
             case 'd':
  
  
  
  1.2       +8 -9      xml-xerces/java/src/org/apache/xerces/utils/regex/RangeToken.java
  
  Index: RangeToken.java
  ===================================================================
  RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/utils/regex/RangeToken.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- RangeToken.java	2000/04/04 19:31:31	1.1
  +++ RangeToken.java	2000/06/20 23:57:18	1.2
  @@ -158,11 +158,12 @@
        */
       protected void compactRanges() {
           boolean DEBUG = false;
  -        if (this.ranges == null || this.ranges.length == 2)
  +        if (this.ranges == null || this.ranges.length <= 2)
               return;
           if (this.isCompacted())
               return;
  -        int base = 0, target = 0;
  +        int base = 0;                           // Index of writing point
  +        int target = 0;                         // Index of processing point
   
           while (target < this.ranges.length) {
               if (base != target) {
  @@ -170,8 +171,6 @@
                   this.ranges[base+1] = this.ranges[target++];
               } else
                   target += 2;
  -            if (target >= this.ranges.length)
  -                break;
               int baseend = this.ranges[base+1];
               while (target < this.ranges.length) {
                   if (baseend+1 < this.ranges[target])
  @@ -217,10 +216,10 @@
                                                  +"] ["+this.ranges[target]
                                                  +","+this.ranges[target+1]+"]");
                   }
  -            }
  +            } // while
               base += 2;
           }
  -        base += 2;
  +
           if (base != this.ranges.length) {
               int[] result = new int[base];
               System.arraycopy(this.ranges, 0, result, 0, base);
  @@ -570,7 +569,7 @@
           //for (int i = 0;  i < asize;  i ++)  System.err.println("Map: "+Integer.toString(this.map[i], 16));
       }
   
  -    public String toString() {
  +    public String toString(int options) {
           String ret;
           if (this.type == RANGE) {
               if (this == Token.token_dot)
  @@ -585,7 +584,7 @@
                   StringBuffer sb = new StringBuffer();
                   sb.append("[");
                   for (int i = 0;  i < this.ranges.length;  i += 2) {
  -                    if (i > 0)  sb.append(",");
  +                    if ((options & RegularExpression.SPECIAL_COMMA) != 0 && i > 0)  sb.append(",");
                       if (this.ranges[i] == this.ranges[i+1]) {
                           sb.append(escapeCharInCharClass(this.ranges[i]));
                       } else {
  @@ -608,7 +607,7 @@
                   StringBuffer sb = new StringBuffer();
                   sb.append("[^");
                   for (int i = 0;  i < this.ranges.length;  i += 2) {
  -                    if (i > 0)  sb.append(",");
  +                    if ((options & RegularExpression.SPECIAL_COMMA) != 0 && i > 0)  sb.append(",");
                       if (this.ranges[i] == this.ranges[i+1]) {
                           sb.append(escapeCharInCharClass(this.ranges[i]));
                       } else {
  
  
  
  1.3       +185 -46   xml-xerces/java/src/org/apache/xerces/utils/regex/RegexParser.java
  
  Index: RegexParser.java
  ===================================================================
  RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/utils/regex/RegexParser.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- RegexParser.java	2000/05/17 18:32:55	1.2
  +++ RegexParser.java	2000/06/20 23:57:18	1.3
  @@ -87,10 +87,11 @@
       static final int T_NEGATIVELOOKBEHIND = 17; // '(?<!'
       static final int T_INDEPENDENT = 18;        // '(?>'
       static final int T_SET_OPERATIONS = 19;     // '(?['
  -    static final int T_POSIX_CHARCLASS_START = 20; // '[:'
  +    static final int T_POSIX_CHARCLASS_START = 20; // '[:' in a character class
       static final int T_COMMENT = 21;            // '(?#'
       static final int T_MODIFIERS = 22;          // '(?' [\-,a-z,A-Z]
       static final int T_CONDITION = 23;          // '(?('
  +    static final int T_XMLSCHEMA_CC_SUBTRACTION = 24; // '-[' in a character class
   
       static class ReferencePosition {
           int refNumber;
  @@ -108,8 +109,9 @@
       ResourceBundle resources;
       int chardata;
       int nexttoken;
  -    static private final int S_NORMAL = 0;
  -    static private final int S_INBRACKETS = 1;
  +    static protected final int S_NORMAL = 0;
  +    static protected final int S_INBRACKETS = 1;
  +    static protected final int S_INXBRACKETS = 2;
       int context = S_NORMAL;
       int parennumber = 1;
       boolean hasBackReferences;
  @@ -166,12 +168,14 @@
           return ret;
       }
   
  +    /*
       public RegularExpression createRegex(String regex, int options) throws ParseException {
           Token tok = this.parse(regex, options);
           return new RegularExpression(regex, tok, this.parennumber, this.hasBackReferences, options);
       }
  +    */
   
  -    private final void setContext(int con) {
  +    protected final void setContext(int con) {
           this.context = con;
       }
   
  @@ -201,8 +205,18 @@
                   this.chardata = this.regex.charAt(this.offset++);
                   break;
   
  +              case '-':
  +                if (this.isSet(RegularExpression.XMLSCHEMA_MODE)
  +                    && this.offset < this.regexlen && this.regex.charAt(this.offset) == '[') {
  +                    this.offset++;
  +                    ret = T_XMLSCHEMA_CC_SUBTRACTION;
  +                } else
  +                    ret = T_CHAR;
  +                break;
  +
                 case '[':
  -                if (this.offset < this.regexlen && this.regex.charAt(this.offset) == ':') {
  +                if (!this.isSet(RegularExpression.XMLSCHEMA_MODE)
  +                    && this.offset < this.regexlen && this.regex.charAt(this.offset) == ':') {
                       this.offset++;
                       ret = T_POSIX_CHARCLASS_START;
                       break;
  @@ -567,10 +581,14 @@
       Token processBacksolidus_I() throws ParseException {
           throw ex("parser.process.1", this.offset);
       }
  -    Token processBacksolidus_X() throws ParseException {
  +    Token processBacksolidus_g() throws ParseException {
           this.next();
           return Token.getGraphemePattern();
       }
  +    Token processBacksolidus_X() throws ParseException {
  +        this.next();
  +        return Token.getCombiningCharacterSequence();
  +    }
       Token processBackreference() throws ParseException {
           int refnum = this.chardata-'0';
           Token tok = Token.createBackReference(refnum);
  @@ -737,6 +755,7 @@
                 case 'C': return this.processBacksolidus_C();
                 case 'i': return this.processBacksolidus_i();
                 case 'I': return this.processBacksolidus_I();
  +              case 'g': return this.processBacksolidus_g();
                 case 'X': return this.processBacksolidus_X();
                 case '1':  case '2':  case '3':  case '4':
                 case '5':  case '6':  case '7':  case '8':  case '9':
  @@ -744,43 +763,9 @@
   
                 case 'P':
                 case 'p':
  -                boolean positive = this.chardata == 'p';
                   int pstart = this.offset;
  -                this.next();
  -                if (this.read() != T_CHAR)  throw ex("parser.atom.2", this.offset-1);
  -                switch (this.chardata) {
  -                  case 'L':                     // Letter
  -                    tok = Token.getRange("L", positive);  break;
  -                  case 'M':                     // Mark
  -                    tok = Token.getRange("M", positive);  break;
  -                  case 'N':                     // Number
  -                    tok = Token.getRange("N", positive);  break;
  -                  case 'Z':                     // Separator
  -                    tok = Token.getRange("Z", positive);  break;
  -                  case 'C':                     // Other
  -                    tok = Token.getRange("C", positive);  break;
  -                  case 'P':                     // Punctuation
  -                    tok = Token.getRange("P", positive);  break;
  -                  case 'S':                     // Symbol
  -                    tok = Token.getRange("S", positive);  break;
  -                  case '{':
  -                    // this.offset points the next of '{'.
  -                    pstart = this.offset;
  -                    int namestart = this.offset;
  -                    int nameend = this.regex.indexOf('}', namestart);
  -                    if (nameend < 0)  throw ex("parser.atom.3", this.offset);
  -                    this.offset = nameend+1;
  -                    tok = Token.getRange(this.regex.substring(namestart, nameend), positive);
  -                    /*
  -                    if (this.isSet(RegularExpression.IGNORE_CASE))
  -                        tok = RangeToken.createCaseInsensitiveToken(tok);
  -                    */
  -                    break;
  -
  -                  default:
  -                    throw ex("parser.atom.2", this.offset-1);
  -                }
  -                if (tok == null)  throw ex("parser.atom.5", pstart);
  +                tok = processBacksolidus_pP(this.chardata);
  +                if (tok == null)  throw this.ex("parser.atom.5", pstart);
                   break;
   
                 default:
  @@ -794,8 +779,48 @@
               this.next();
               break;
   
  +          default:
  +            throw this.ex("parser.atom.4", this.offset-1);
  +        }
  +        return tok;
  +    }
  +
  +    protected RangeToken processBacksolidus_pP(int c) throws ParseException {
  +        boolean positive = c == 'p';
  +        this.next();
  +        if (this.read() != T_CHAR)  throw this.ex("parser.atom.2", this.offset-1);
  +        RangeToken tok;
  +        switch (this.chardata) {
  +          case 'L':                             // Letter
  +            tok = Token.getRange("L", positive);  break;
  +          case 'M':                             // Mark
  +            tok = Token.getRange("M", positive);  break;
  +          case 'N':                             // Number
  +            tok = Token.getRange("N", positive);  break;
  +          case 'Z':                             // Separator
  +            tok = Token.getRange("Z", positive);  break;
  +          case 'C':                             // Other
  +            tok = Token.getRange("C", positive);  break;
  +          case 'P':                             // Punctuation
  +            tok = Token.getRange("P", positive);  break;
  +          case 'S':                             // Symbol
  +            tok = Token.getRange("S", positive);  break;
  +          case '{':
  +            // this.offset points to the character just after '{'.
  +            //pstart = this.offset;
  +            int namestart = this.offset;
  +            int nameend = this.regex.indexOf('}', namestart);
  +            if (nameend < 0)  throw this.ex("parser.atom.3", this.offset);
  +            this.offset = nameend+1;
  +            tok = Token.getRange(this.regex.substring(namestart, nameend), positive);
  +            /*
  +              if (this.isSet(RegularExpression.IGNORE_CASE))
  +              tok = RangeToken.createCaseInsensitiveToken(tok);
  +            */
  +            break;
  +
             default:
  -            throw ex("parser.atom.4", this.offset-1);
  +            throw this.ex("parser.atom.2", this.offset-1);
           }
           return tok;
       }
  @@ -811,7 +836,121 @@
        * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
        * bmp-code ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
        */
  -    private RangeToken parseCharacterClass(boolean useNrange) throws ParseException {
  +    protected RangeToken parseCharacterClass(boolean useNrange) throws ParseException {
  +        this.setContext(S_INBRACKETS);
  +        this.next();                            // '['
  +        boolean nrange = false;
  +        RangeToken base = null;
  +        RangeToken tok;
  +        if (this.read() == T_CHAR && this.chardata == '^') {
  +            nrange = true;
  +            this.next();                        // '^'
  +            if (useNrange) {
  +                tok = Token.createNRange();
  +            } else {
  +                base = Token.createRange();
  +                base.addRange(0, Token.UTF16_MAX);
  +                tok = Token.createRange();
  +            }
  +        } else {
  +            tok = Token.createRange();
  +        }
  +        int type;
  +        boolean firstloop = true;
  +        while ((type = this.read()) != T_EOF) {
  +            if (type == T_CHAR && this.chardata == ']' && !firstloop)
  +                break;
  +            firstloop = false;
  +            int c = this.chardata;
  +            boolean end = false;
  +            if (type == T_BACKSOLIDUS) {
  +                switch (c) {
  +                  case 'd':  case 'D':
  +                  case 'w':  case 'W':
  +                  case 's':  case 'S':
  +                    tok.mergeRanges(this.getTokenForShorthand(c));
  +                    end = true;
  +                    break;
  +
  +                  case 'i':  case 'I':
  +                  case 'c':  case 'C':
  +                    c = this.processCIinCharacterClass(tok, c);
  +                    if (c < 0)  end = true;
  +                    break;
  +                    
  +                  case 'p':
  +                  case 'P':
  +                    int pstart = this.offset;
  +                    RangeToken tok2 = this.processBacksolidus_pP(c);
  +                    if (tok2 == null)  throw this.ex("parser.atom.5", pstart);
  +                    tok.mergeRanges(tok2);
  +                    end = true;
  +                    break;
  +
  +                  default:
  +                    c = this.decodeEscaped();
  +                } // \ + c
  +            } // backsolidus
  +                                                // POSIX Character class such as [:alnum:]
  +            else if (type == T_POSIX_CHARCLASS_START) {
  +                int nameend = this.regex.indexOf(':', this.offset);
  +                if (nameend < 0) throw this.ex("parser.cc.1", this.offset);
  +                boolean positive = true;
  +                if (this.regex.charAt(this.offset) == '^') {
  +                    this.offset ++;
  +                    positive = false;
  +                }
  +                String name = this.regex.substring(this.offset, nameend);
  +                RangeToken range = Token.getRange(name, positive);
  +                if (range == null)  throw this.ex("parser.cc.3", this.offset);
  +                tok.mergeRanges(range);
  +                end = true;
  +                if (nameend+1 >= this.regexlen || this.regex.charAt(nameend+1) != ']')
  +                    throw this.ex("parser.cc.1", nameend);
  +                this.offset = nameend+2;
  +            }
  +            this.next();
  +            if (!end) {                         // if not shorthands...
  +                if (this.read() != T_CHAR || this.chardata != '-') { // Here is no '-'.
  +                    tok.addRange(c, c);
  +                } else {
  +                    this.next(); // Skips '-'
  +                    if ((type = this.read()) == T_EOF)  throw this.ex("parser.cc.2", this.offset);
  +                    if (type == T_CHAR && this.chardata == ']') {
  +                        tok.addRange(c, c);
  +                        tok.addRange('-', '-');
  +                    } else {
  +                        int rangeend = this.chardata;
  +                        if (type == T_BACKSOLIDUS)
  +                            rangeend = this.decodeEscaped();
  +                        this.next();
  +                        tok.addRange(c, rangeend);
  +                    }
  +                }
  +            }
  +            if (this.isSet(RegularExpression.SPECIAL_COMMA)
  +                && this.read() == T_CHAR && this.chardata == ',')
  +                this.next();
  +        }
  +        if (this.read() == T_EOF)
  +            throw this.ex("parser.cc.2", this.offset);
  +        if (!useNrange && nrange) {
  +            base.subtractRanges(tok);
  +            tok = base;
  +        }
  +        tok.sortRanges();
  +        tok.compactRanges();
  +        //tok.dumpRanges();
  +        /*
  +        if (this.isSet(RegularExpression.IGNORE_CASE))
  +            tok = RangeToken.createCaseInsensitiveToken(tok);
  +        */
  +        this.setContext(S_NORMAL);
  +        this.next();                    // Skips ']'
  +
  +        return tok;
  +    }
  +    private RangeToken parseCharacterClass_old(boolean useNrange) throws ParseException {
           this.setContext(S_INBRACKETS);
           this.next();                            // '['
           boolean nrange = false;
  @@ -824,7 +963,7 @@
                   tok = Token.createNRange();
               } else {
                   base = Token.createRange();
  -                base.addRange(0, 0xffff);
  +                base.addRange(0, Token.UTF16_MAX);
                   tok = Token.createRange();
               }
           } else {
  @@ -962,7 +1101,7 @@
       /**
        * '(?[' ... ']' (('-' | '+' | '&') '[' ... ']')? ')'
        */
  -    private RangeToken parseSetOperations() throws ParseException {
  +    protected RangeToken parseSetOperations() throws ParseException {
           RangeToken tok = this.parseCharacterClass(false);
           int type;
           while ((type = this.read()) != T_RPAREN) {
  
  
  
  1.3       +157 -125  xml-xerces/java/src/org/apache/xerces/utils/regex/RegularExpression.java
  
  Index: RegularExpression.java
  ===================================================================
  RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/utils/regex/RegularExpression.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- RegularExpression.java	2000/05/17 18:32:55	1.2
  +++ RegularExpression.java	2000/06/20 23:57:18	1.3
  @@ -85,13 +85,6 @@
    * }
    * </pre>
    *
  - *<!--
  - *   <dt>C. Easy way
  - * <pre>
  - * if (RegularExpression.matches(<var>regex</var>, text) >= 0) { ... }
  - * </pre>
  - *-->
  - *
    * </dl>
    *
    * <h4>Case-insensitive matching</h4>
  @@ -119,16 +112,16 @@
    *      'Unicode Regular Expression Guidelines' Revision 4.
    *      When "w" and "u" are specified at the same time,
    *      <kbd>\b \B \&lt; \></kbd> are processed for the "w" option.
  + *   <dt><a name="COMMA_OPTION"><code>","</code></a>
  + *   <dd>The parser treats a comma in a character class as a range separator.
  + *      <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>,</kbd> or <kbd>b</kbd> without this option.
  + *      <kbd class="REGEX">[a,b]</kbd> matches <kbd>a</kbd> or <kbd>b</kbd> with this option.
    *
    *   <dt><a name="X_OPTION"><code>"X"</code></a>
  - *   <dd class="REGEX"><!--<a href="http://www.w3.org/TR/xmlschema-2/#regexs">XML Schema</a> mode.-->
  - *       By this option, the engine confoms to <a href="http://www.w3.org/TR/1999/WD-xmlschema-2-19991217/#regexs">XML Schema: Regular Expression</a>.
  + *   <dd class="REGEX">
  + *       By this option, the engine conforms to <a href="http://www.w3.org/TR/2000/WD-xmlschema-2-20000407/#regexs">XML Schema: Regular Expression</a>.
    *       The <code>match()</code> method does not do subsring matching
    *       but entire string matching.
  - *       <dl>
  - *         <dt>NOT IMPLEMNTED FEATURES:
  - *         <dd>Character class subtraction
  - *       </dl>
    *
    * </dl>
    * 
  @@ -139,15 +132,13 @@
    *    <td>
    *     <h4>Differences from the Perl 5 regular expression</h4>
    *     <ul>
  - *      <li><kbd>,</kbd> is a special character in <kbd>[]</kbd>.
    *      <li>There is 6-digit hexadecimal character representation  (<kbd>\u005cv</kbd><var>HHHHHH</var>.)
  - *      <li><kbd>\X</kbd> has different meaning.
    *      <li>Supports subtraction, union, and intersection operations for character classes.
    *      <li>Not supported: <kbd>\</kbd><var>ooo</var> (Octal character representations),
    *          <Kbd>\G</kbd>, <kbd>\C</kbd>, <kbd>\l</kbd><var>c</var>,
    *          <kbd>\u005cu</kbd><var>c</var>, <kbd>\L</kbd>, <kbd>\U</kbd>,
  - *          <kbd>\E</kbd>, <kbd>\Q</kbd>,
  - *          <Kbd>(?{<kbd><var>code</var><kbd>})</kbd>, <Kbd>(?p{<kbd><var>code</var><kbd>})</kbd>
  + *          <kbd>\E</kbd>, <kbd>\Q</kbd>, <kbd>\N{</kbd><var>name</var><kbd>}</kbd>,
  + *          <Kbd>(?{<kbd><var>code</var><kbd>})</kbd>, <Kbd>(??{<kbd><var>code</var><kbd>})</kbd>
    *     </ul>
    *    </td>
    *   </tr>
  @@ -197,16 +188,20 @@
    *       <dt class="REGEX"><kbd>\u005cv</kbd><var>HHHHHH</var>
    *       <dd>Matches a character of which code point is <var>HHHHHH</var> (Hexadecimal) in Unicode.
    *
  - *       <dt class="REGEX"><kbd>\X</kbd>
  + *       <dt class="REGEX"><kbd>\g</kbd>
    *       <dd>Matches a grapheme.
  - *       <dd class="REGEX">It is equivalent to <kbd>(?[\p{ASSIGNED}]-[\p{M},\p{C}])?(?:\p{M}|[\x{094D},\x{09CD},\x{0A4D},\x{0ACD},\x{0B3D},\x{0BCD},\x{0C4D},\x{0CCD},\x{0D4D},\x{0E3A},\x{0F84}]\p{L}|[\x{1160}-\x{11A7}]|[\x{11A8}-\x{11FF}]|[\x{FF9E},\x{FF9F}])*</kbd>
  - *       <dd class="WARNING"><Kbd>\X</kbd> in Perl 5.6 means <kbd>\P{M}\p{M}*</kbd>.</dd>
  + *       <dd class="REGEX">It is equivalent to <kbd>(?[\p{ASSIGNED}]-[\p{M}\p{C}])?(?:\p{M}|[\x{094D}\x{09CD}\x{0A4D}\x{0ACD}\x{0B3D}\x{0BCD}\x{0C4D}\x{0CCD}\x{0D4D}\x{0E3A}\x{0F84}]\p{L}|[\x{1160}-\x{11A7}]|[\x{11A8}-\x{11FF}]|[\x{FF9E}\x{FF9F}])*</kbd>
  + *
  + *       <dt class="REGEX"><kbd>\X</kbd>
  + *       <dd class="REGEX">Matches a combining character sequence.
  + *       It is equivalent to <kbd>(?:\PM\pM*)</kbd>
    *     </dl>
    *   </li>
    *
    *   <li>Character class
    *     <dl>
  - *       <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd>
  + *       <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without <a href="#COMMA_OPTION">"," option</a>)
  + *       <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with <a href="#COMMA_OPTION">"," option</a>)
    *       <dd>Positive character class.  It matches a character in ranges.
    *       <dd><var>R<sub>n</sub></var>:
    *       <ul>
  @@ -214,61 +209,65 @@
    *             <p>This range matches the character.
    *         <li class="REGEX"><var>C<sub>1</sub></var><kbd>-</kbd><var>C<sub>2</sub></var>
    *             <p>This range matches a character which has a code point that is >= <var>C<sub>1</sub></var>'s code point and &lt;= <var>C<sub>2</sub></var>'s code point.
  - *         <li class="REGEX">A POSIX character class: <Kbd>[:alpha:] [:alnum:] [:ascii:] [:cntrl:] [:digit:] [:graph:] [:lower:] [:print:] [:punct:] [:space:] [:upper:] [:xdigit:]</kbd>
  + *         <li class="REGEX">A POSIX character class: <Kbd>[:alpha:] [:alnum:] [:ascii:] [:cntrl:] [:digit:] [:graph:] [:lower:] [:print:] [:punct:] [:space:] [:upper:] [:xdigit:]</kbd>,
  + *             and negative POSIX character classes in Perl like <kbd>[:^alpha:]</kbd>
    *             <p>...
    *         <li class="REGEX"><kbd>\d \D \s \S \w \W \p{</kbd><var>name</var><kbd>} \P{</kbd><var>name</var><kbd>}</kbd>
    *             <p>These expressions specifies the same ranges as the following expressions.
    *       </ul>
    *       <p class="REGEX">Enumerated ranges are merged (union operation).
  - *          <kbd>[a-e,c-z]</kbd> is equivalent to <kbd>[a-z]</kbd>
  + *          <kbd>[a-ec-z]</kbd> is equivalent to <kbd>[a-z]</kbd>
    *
  - *       <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd>
  + *       <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><var>R<sub>2</sub></var><var>...</var><var>R<sub>n</sub></var><kbd>]</kbd> (without a <a href="#COMMA_OPTION">"," option</a>)
  + *       <dt class="REGEX"><kbd>[^</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var><kbd>]</kbd> (with a <a href="#COMMA_OPTION">"," option</a>)
    *       <dd>Negative character class.  It matches a character not in ranges.
    *
    *       <dt class="REGEX"><kbd>(?[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd><var>op</var><kbd>[</kbd><var>ranges</var><kbd>]</kbd> ... <Kbd>)</kbd>
  - *       (<var>op</var> is <kbd>-</kbd> or <kbd>+</kbd> or <kbd>&</kbd>.
  - *       <var>ranges</var> is <var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var> or <kbd>^</kbd><var>R<sub>1</sub></var><kbd>,</kbd><var>R<sub>2</sub></var><kbd>,</kbd><var>...</var><kbd>,</kbd><var>R<sub>n</sub></var>.)
  - *       <dd class="WARNING">This feature is highly experimental.</dd>
  + *       (<var>op</var> is <kbd>-</kbd> or <kbd>+</kbd> or <kbd>&</kbd>.)
    *       <dd>Subtraction or union or intersection for character classes.
  - *       <dd class="REGEX">For exmaple, <kbd>(?[A-Z]-[C,F])</kbd> is equivalent to <kbd>[A-B,D-E,G-Z]</kbd>, and <kbd>(?[0x00-0x7f]-[K]&[\p{Lu}])</kbd> is equivalent to <kbd>[A-J,L-Z]</kbd>.
  + *       <dd class="REGEX">For example, <kbd>(?[A-Z]-[CF])</kbd> is equivalent to <kbd>[A-BD-EG-Z]</kbd>, and <kbd>(?[0x00-0x7f]-[K]&[\p{Lu}])</kbd> is equivalent to <kbd>[A-JL-Z]</kbd>.
    *       <dd>The result of this operations is a <u>positive character class</u>
    *           even if an expression includes any negative character classes.
    *           You have to take care on this in case-insensitive matching.
  - *           For instance, <kbd>(?[^b])</kbd> is equivalent to <kbd>[\x00-a,c-\x{10ffff}]</kbd>,
  + *           For instance, <kbd>(?[^b])</kbd> is equivalent to <kbd>[\x00-ac-\x{10ffff}]</kbd>,
    *           which is equivalent to <kbd>[^b]</kbd> in case-sensitive matching.
    *           But, in case-insensitive matching, <kbd>(?[^b])</kbd> matches any character because
    *           it includes '<kbd>B</kbd>' and '<kbd>B</kbd>' matches '<kbd>b</kbd>'
  - *           though <kbd>[^b]</kbd> is processed as <kbd>[^B,b]</kbd>.
  + *           though <kbd>[^b]</kbd> is processed as <kbd>[^Bb]</kbd>.
    *
  + *       <dt class="REGEX"><kbd>[</kbd><var>R<sub>1</sub>R<sub>2</sub>...</var><kbd>-[</kbd><var>R<sub>n</sub>R<sub>n+1</sub>...</var><kbd>]]</kbd> (with an <a href="#X_OPTION">"X" option</a>)</dt>
  + *       <dd>Character class subtraction for the XML Schema.
  + *           You can use this syntax when you specify an <a href="#X_OPTION">"X" option</a>.
  + *           
    *       <dt class="REGEX"><kbd>\d</kbd>
    *       <dd class="REGEX">Equivalent to <kbd>[0-9]</kbd>.
  - *       <dd>When <a href="#U_OPTION">the "u" option</a> is set, it is equivalent to
  + *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
    *           <span class="REGEX"><kbd>\p{Nd}</kbd></span>.
    *
    *       <dt class="REGEX"><kbd>\D</kbd>
    *       <dd class="REGEX">Equivalent to <kbd>[^0-9]</kbd>
  - *       <dd>When <a href="#U_OPTION">the "u" option</a> is set, it is equivalent to
  + *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
    *           <span class="REGEX"><kbd>\P{Nd}</kbd></span>.
    *
    *       <dt class="REGEX"><kbd>\s</kbd>
  - *       <dd class="REGEX">Equivalent to <kbd>[ ,\f,\n,\r,\t]</kbd>
  - *       <dd>When <a href="#U_OPTION">the "u" option</a> is set, it is equivalent to
  - *           <span class="REGEX"><kbd>[ ,\f,\n,\r,\t,\p{Z}]</kbd></span>.
  + *       <dd class="REGEX">Equivalent to <kbd>[ \f\n\r\t]</kbd>
  + *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
  + *           <span class="REGEX"><kbd>[ \f\n\r\t\p{Z}]</kbd></span>.
    *
    *       <dt class="REGEX"><kbd>\S</kbd>
  - *       <dd class="REGEX">Equivalent to <kbd>[^ ,\f,\n,\r,\t]</kbd>
  - *       <dd>When <a href="#U_OPTION">the "u" option</a> is set, it is equivalent to
  - *           <span class="REGEX"><kbd>[^ ,\f,\n,\r,\t,\p{Z}]</kbd></span>.
  + *       <dd class="REGEX">Equivalent to <kbd>[^ \f\n\r\t]</kbd>
  + *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
  + *           <span class="REGEX"><kbd>[^ \f\n\r\t\p{Z}]</kbd></span>.
    *
    *       <dt class="REGEX"><kbd>\w</kbd>
  - *       <dd class="REGEX">Equivalent to <kbd>[a-z,A-Z,0-9,_]</kbd>
  - *       <dd>When <a href="#U_OPTION">the "u" option</a> is set, it is equivalent to
  - *           <span class="REGEX"><kbd>[\p{Lu},\p{Ll},\p{Lo},\p{Nd},_]</kbd></span>.
  + *       <dd class="REGEX">Equivalent to <kbd>[a-zA-Z0-9_]</kbd>
  + *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
  + *           <span class="REGEX"><kbd>[\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
    *
    *       <dt class="REGEX"><kbd>\W</kbd>
  - *       <dd class="REGEX">Equivalent to <kbd>[^a-z,A-Z,0-9,_]</kbd>
  - *       <dd>When <a href="#U_OPTION">the "u" option</a> is set, it is equivalent to
  - *           <span class="REGEX"><kbd>[^\p{Lu},\p{Ll},\p{Lo},\p{Nd},_]</kbd></span>.
  + *       <dd class="REGEX">Equivalent to <kbd>[^a-zA-Z0-9_]</kbd>
  + *       <dd>When <a href="#U_OPTION">a "u" option</a> is set, it is equivalent to
  + *           <span class="REGEX"><kbd>[^\p{Lu}\p{Ll}\p{Lo}\p{Nd}_]</kbd></span>.
    *
    *       <dt class="REGEX"><kbd>\p{</kbd><var>name</var><kbd>}</kbd>
    *       <dd>Matches one character in the specified General Category (the second field in <a href="ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt"><kbd>UnicodeData.txt</kbd></a>) or the specified <a href="ftp://ftp.unicode.org/Public/UNIDATA/Blocks.txt">Block</a>.
  @@ -403,7 +402,7 @@
    *
    *       <dt class="REGEX"><kbd>^</kbd>
    *       <dd>Matches the beginning of the text.  It is equivalent to <span class="REGEX"><Kbd>\A</kbd></span>.
  - *       <dd>When <a href="#M_OPTION">the "m" option</a> is set,
  + *       <dd>When <a href="#M_OPTION">a "m" option</a> is set,
    *           it matches the beginning of the text, or after one of EOL characters (
    *           LINE FEED (U+000A), CARRIAGE RETURN (U+000D), LINE SEPARATOR (U+2028),
    *           PARAGRAPH SEPARATOR (U+2029).)
  @@ -411,24 +410,24 @@
    *       <dt class="REGEX"><kbd>$</kbd>
    *       <dd>Matches the end of the text, or before an EOL character at the end of the text,
    *           or CARRIAGE RETURN + LINE FEED at the end of the text.
  - *       <dd>When <a href="#M_OPTION">the "m" option</a> is set,
  + *       <dd>When <a href="#M_OPTION">a "m" option</a> is set,
    *           it matches the end of the text, or before an EOL character.
    *
    *       <dt class="REGEX"><kbd>\b</kbd>
    *       <dd>Matches word boundary.
  - *           (See <a href="#W_OPTION">the "w" option</a>)
  + *           (See <a href="#W_OPTION">a "w" option</a>)
    *
    *       <dt class="REGEX"><kbd>\B</kbd>
    *       <dd>Matches non word boundary.
  - *           (See <a href="#W_OPTION">the "w" option</a>)
  + *           (See <a href="#W_OPTION">a "w" option</a>)
    *
    *       <dt class="REGEX"><kbd>\&lt;</kbd>
    *       <dd>Matches the beginning of a word.
  - *           (See <a href="#W_OPTION">the "w" option</a>)
  + *           (See <a href="#W_OPTION">a "w" option</a>)
    *
    *       <dt class="REGEX"><kbd>\&gt;</kbd>
    *       <dd>Matches the end of a word.
  - *           (See <a href="#W_OPTION">the "w" option</a>)
  + *           (See <a href="#W_OPTION">a "w" option</a>)
    *     </dl>
    *   </li>
    *   <li>Lookahead and lookbehind
  @@ -493,7 +492,7 @@
    *
    * char-class ::= '[' ranges ']'
    *                | '(?[' ranges ']' ([-+&] '[' ranges ']')? ')'
  - * ranges ::= '^'? (range ','?)+
  + * ranges ::= '^'? (range <a href="#COMMA_OPTION">','?</a>)+
    * range ::= '\d' | '\w' | '\s' | '\D' | '\W' | '\S' | category-block
    *           | range-char | range-char '-' range-char
    * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | code-point | character-2
  @@ -523,6 +522,9 @@
   public class RegularExpression implements java.io.Serializable {
       static final boolean DEBUG = false;
   
  +    /**
  +     * Compiles a token tree into an operation flow.
  +     */
       private synchronized void compile(Token tok) {
           if (this.operations != null)
               return;
  @@ -530,6 +532,9 @@
           this.operations = this.compile(tok, null, false);
       }
   
  +    /**
  +     * Converts a token to an operation.
  +     */
       private Op compile(Token tok, Op next, boolean reverse) {
           Op ret;
           switch (tok.type) {
  @@ -688,39 +693,47 @@
   
   //Public
   
  -/**
  - *
  - * @return true if the target is matched to this regular expression.
  - */
  +    /**
  +     * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
  +     *
  +     * @return true if the target is matched to this regular expression.
  +     */
       public boolean matches(char[]  target) {
           return this.matches(target, 0,  target .length , (Match)null);
       }
   
  -/**
  - *
  - * @return true if the target is matched to this regular expression.
  - */
  +    /**
  +     * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
  +     * in specified range or not.
  +     *
  +     * @param start Start offset of the range.
  +     * @param end  End offset +1 of the range.
  +     * @return true if the target is matched to this regular expression.
  +     */
       public boolean matches(char[]  target, int start, int end) {
           return this.matches(target, start, end, (Match)null);
       }
   
  -/**
  - *
  - * @param match A Match instance for storing matching result.
  - * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
  - */
  +    /**
  +     * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
  +     *
  +     * @param match A Match instance for storing matching result.
  +     * @return true if the target is matched to this regular expression.
  +     */
       public boolean matches(char[]  target, Match match) {
           return this.matches(target, 0,  target .length , match);
       }
   
   
  -/**
  - *
  - * @param match A Match instance for storing matching result.
  - * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
  - */
  -
  -
  +    /**
  +     * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
  +     * in specified range or not.
  +     *
  +     * @param start Start offset of the range.
  +     * @param end  End offset +1 of the range.
  +     * @param match A Match instance for storing matching result.
  +     * @return true if the target is matched to this regular expression.
  +     */
       public boolean matches(char[]  target, int start, int end, Match match) {
   
           synchronized (this) {
  @@ -799,7 +812,8 @@
           /*
            * Checks whether the expression starts with ".*".
            */
  -        if (this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
  +        if (this.operations != null
  +            && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
               if (isSet(this.options, SINGLE_LINE)) {
                   matchStart = con.start;
                   matchEnd = this. matchCharArray (con, this.operations, con.start, 1, this.options);
  @@ -1387,42 +1401,46 @@
   
   
   
  -/**
  - *
  - * @return true if the target is matched to this regular expression.
  - */
  +    /**
  +     * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
  +     *
  +     * @return true if the target is matched to this regular expression.
  +     */
       public boolean matches(String  target) {
           return this.matches(target, 0,  target .length() , (Match)null);
       }
   
  -/**
  - *
  - * @return true if the target is matched to this regular expression.
  - */
  +    /**
  +     * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
  +     * in specified range or not.
  +     *
  +     * @param start Start offset of the range.
  +     * @param end  End offset +1 of the range.
  +     * @return true if the target is matched to this regular expression.
  +     */
       public boolean matches(String  target, int start, int end) {
           return this.matches(target, start, end, (Match)null);
       }
   
  -/**
  - *
  - * @param match A Match instance for storing matching result.
  - * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
  - */
  +    /**
  +     * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
  +     *
  +     * @param match A Match instance for storing matching result.
  +     * @return true if the target is matched to this regular expression.
  +     */
       public boolean matches(String  target, Match match) {
           return this.matches(target, 0,  target .length() , match);
       }
  -
  -
  -/**
  - *
  - * @param match A Match instance for storing matching result.
  - * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
  - */
  -
  -
  -
   
  -
  +    /**
  +     * Checks whether the <var>target</var> text <strong>contains</strong> this pattern
  +     * in specified range or not.
  +     *
  +     * @param start Start offset of the range.
  +     * @param end  End offset +1 of the range.
  +     * @param match A Match instance for storing matching result.
  +     * @return true if the target is matched to this regular expression.
  +     */
       public boolean matches(String  target, int start, int end, Match match) {
   
           synchronized (this) {
  @@ -1501,7 +1519,8 @@
           /*
            * Checks whether the expression starts with ".*".
            */
  -        if (this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
  +        if (this.operations != null
  +            && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
               if (isSet(this.options, SINGLE_LINE)) {
                   matchStart = con.start;
                   matchEnd = this. matchString (con, this.operations, con.start, 1, this.options);
  @@ -1584,9 +1603,9 @@
           }
       }
   
  -/**
  - * @return -1 when not match; offset of the end of matched string when match.
  - */
  +    /**
  +     * @return -1 when not match; offset of the end of matched string when match.
  +     */
       private int matchString (Context con, Op op, int offset, int dx, int opts) {
   
   
  @@ -2048,21 +2067,22 @@
   
   
   
  -/**
  - *
  - * @return true if the target is matched to this regular expression.
  - */
  +    /**
  +     * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
  +     *
  +     * @return true if the target is matched to this regular expression.
  +     */
       public boolean matches(CharacterIterator target) {
           return this.matches(target, (Match)null);
       }
   
  -
  -/**
  - *
  - * @param match A Match instance for storing matching result.
  - * @return Offset of the start position in <VAR>target</VAR>; or -1 if not match.
  - */
   
  +    /**
  +     * Checks whether the <var>target</var> text <strong>contains</strong> this pattern or not.
  +     *
  +     * @param match A Match instance for storing matching result.
  +     * @return true if the target is matched to this regular expression.
  +     */
       public boolean matches(CharacterIterator  target, Match match) {
           int start = target.getBeginIndex();
           int end = target.getEndIndex();
  @@ -2145,7 +2165,8 @@
           /*
            * Checks whether the expression starts with ".*".
            */
  -        if (this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
  +        if (this.operations != null
  +            && this.operations.type == Op.CLOSURE && this.operations.getChild().type == Op.DOT) {
               if (isSet(this.options, SINGLE_LINE)) {
                   matchStart = con.start;
                   matchEnd = this. matchCharacterIterator (con, this.operations, con.start, 1, this.options);
  @@ -2228,9 +2249,9 @@
           }
       }
   
  -/**
  - * @return -1 when not match; offset of the end of matched string when match.
  - */
  +    /**
  +     * @return -1 when not match; offset of the end of matched string when match.
  +     */
       private int matchCharacterIterator (Context con, Op op, int offset, int dx, int opts) {
   
   
  @@ -2690,9 +2711,9 @@
           return true;
       }
   
  -/**
  - * @see java.lang.String#regionMatches
  - */
  +    /**
  +     * @see java.lang.String#regionMatches
  +     */
       private static final boolean regionMatchesIgnoreCase(CharacterIterator  target, int offset, int limit,
                                                            String part, int partlen) {
           if (offset < 0)  return false;
  @@ -2817,6 +2838,9 @@
           }
       }
   
  +    /**
  +     * Prepares for matching.  This method is called just before starting matching.
  +     */
       void prepare() {
           if (Op.COUNT)  Op.nofinstances = 0;
           this.compile(this.tokentree);
  @@ -2844,7 +2868,8 @@
               }
           }
   
  -        if ((this.operations.type == Op.STRING || this.operations.type == Op.CHAR)
  +        if (this.operations != null
  +            && (this.operations.type == Op.STRING || this.operations.type == Op.CHAR)
               && this.operations.next == null) {
               if (DEBUG)
                   System.err.print(" *** Only fixed string! *** ");
  @@ -2950,14 +2975,18 @@
        * "X". XML Schema mode.
        */
       static final int XMLSCHEMA_MODE = 1<<9;
  +    /**
  +     * ",".
  +     */
  +    static final int SPECIAL_COMMA = 1<<10;
   
   
       private static final boolean isSet(int options, int flag) {
  -        return(options & flag) == flag;
  +        return (options & flag) == flag;
       }
   
       /**
  -     * Constructor.
  +     * Creates a new RegularExpression instance.
        *
        * @param regex A regular expression
        * @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax.
  @@ -2967,10 +2996,10 @@
       }
   
       /**
  -     * Constructor.
  +     * Creates a new RegularExpression instance with options.
        *
        * @param regex A regular expression
  -     * @param options A String consisted of "i" "m" "s" "u" "w"
  +     * @param options A String consisting of "i" "m" "s" "u" "w" "," "X"
        * @exception org.apache.xerces.utils.regex.ParseException <VAR>regex</VAR> is not conforming to the syntax.
        */
       public RegularExpression(String regex, String options) throws ParseException {
  @@ -3022,10 +3051,13 @@
        * Represents this instence in String.
        */
       public String toString() {
  -        return this.tokentree.toString();
  +        return this.tokentree.toString(this.options);
       }
   
       /**
  +     * Returns an option string.
  +     * The order of letters in it may be different from a string specified
  +     * in a constructor or <code>setPattern()</code>.
        *
        * @see #RegularExpression(java.lang.String,java.lang.String)
        * @see #setPattern(java.lang.String,java.lang.String)
  @@ -3035,7 +3067,7 @@
       }
   
       /**
  -     *
  +     *  Return true if patterns are the same and the options are equivalent.
        */
       public boolean equals(Object obj) {
           if (obj == null)  return false;
  @@ -3053,7 +3085,7 @@
        *
        */
       public int hashCode() {
  -        return(this.regex+"/"+this.getOptions()).hashCode();
  +        return (this.regex+"/"+this.getOptions()).hashCode();
       }
   
       /**
  
  
  
  1.3       +5 -0      xml-xerces/java/src/org/apache/xerces/utils/regex/REUtil.java
  
  Index: REUtil.java
  ===================================================================
  RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/utils/regex/REUtil.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- REUtil.java	2000/05/17 18:32:54	1.2
  +++ REUtil.java	2000/06/20 23:57:18	1.3
  @@ -123,6 +123,9 @@
             case 'X':
               ret = RegularExpression.XMLSCHEMA_MODE;
               break;
  +          case ',':
  +            ret = RegularExpression.SPECIAL_COMMA;
  +            break;
             default:
           }
           return ret;
  @@ -160,6 +163,8 @@
               sb.append((char)'w');
           if ((options & RegularExpression.EXTENDED_COMMENT) != 0)
               sb.append((char)'x');
  +        if ((options & RegularExpression.SPECIAL_COMMA) != 0)
  +            sb.append((char)',');
           return sb.toString().intern();
       }
   
  
  
  
  1.3       +58 -40    xml-xerces/java/src/org/apache/xerces/utils/regex/Token.java
  
  Index: Token.java
  ===================================================================
  RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/utils/regex/Token.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- Token.java	2000/05/17 18:32:57	1.2
  +++ Token.java	2000/06/20 23:57:18	1.3
  @@ -276,6 +276,9 @@
       }
   
       public String toString() {
  +        return this.toString(0);
  +    }
  +    public String toString(int options) {
           return this.type == Token.DOT ? "." : "";
       }
   
  @@ -811,10 +814,11 @@
                       Token.categories.put(n, r1);
                       Token.categories2.put(n, Token.complementRanges(r1));
                       if (n.indexOf(' ') >= 0) {
  -                        StringBuffer buffer = new StringBuffer(n.length());
  +                        StringBuffer buffer = new StringBuffer(n.length()+2);
  +                        buffer.append("Is");
                           for (int ci = 0;  ci < n.length();  ci ++)
                               if (n.charAt(ci) != ' ')  buffer.append((char)n.charAt(ci));
  -                        Token.setAlias(buffer.toString(), n, true);
  +                        Token.setAlias(new String(buffer), n, true);
                       }
                   }
   
  @@ -961,6 +965,20 @@
           return Token.token_grapheme;
       }
   
  +    /**
  +     * Combining Character Sequence in Perl 5.6.
  +     */
  +    static private Token token_ccs = null;
  +    static synchronized protected Token getCombiningCharacterSequence() {
  +        if (Token.token_ccs != null)
  +            return Token.token_ccs;
  +
  +        Token foo = Token.createClosure(Token.getRange("M", true)); // \pM*
  +        foo = Token.createConcat(Token.getRange("M", false), foo); // \PM + \pM*
  +        Token.token_ccs = foo;
  +        return Token.token_ccs;
  +    }
  +
       // ------------------------------------------------------
   
       // ------------------------------------------------------
  @@ -984,7 +1002,7 @@
               return this.string;
           }
           
  -        public String toString() {
  +        public String toString(int options) {
               if (this.type == BACKREFERENCE)
                   return "\\"+this.refNumber;
               else
  @@ -1012,14 +1030,14 @@
               return index == 0 ? this.child : this.child2;
           }
   
  -        public String toString() {
  +        public String toString(int options) {
               String ret;
               if (this.child2.type == CLOSURE && this.child2.getChild(0) == this.child) {
  -                ret = this.child.toString()+"+";
  +                ret = this.child.toString(options)+"+";
               } else if (this.child2.type == NONGREEDYCLOSURE && this.child2.getChild(0) == this.child) {
  -                ret = this.child.toString()+"+?";
  +                ret = this.child.toString(options)+"+?";
               } else
  -                ret = this.child.toString()+this.child2.toString();
  +                ret = this.child.toString(options)+this.child2.toString(options);
               return ret;
           }
       }
  @@ -1039,7 +1057,7 @@
               return this.chardata;
           }
   
  -        public String toString() {
  +        public String toString(int options) {
               String ret;
               switch (this.type) {
                 case CHAR:
  @@ -1120,29 +1138,29 @@
               return this.max;
           }
   
  -        public String toString() {
  +        public String toString(int options) {
               String ret;
               if (this.type == CLOSURE) {
                   if (this.getMin() < 0 && this.getMax() < 0) {
  -                    ret = this.child.toString()+"*";
  +                    ret = this.child.toString(options)+"*";
                   } else if (this.getMin() == this.getMax()) {
  -                    ret = this.child.toString()+"{"+this.getMin()+"}";
  +                    ret = this.child.toString(options)+"{"+this.getMin()+"}";
                   } else if (this.getMin() >= 0 && this.getMax() >= 0) {
  -                    ret = this.child.toString()+"{"+this.getMin()+","+this.getMax()+"}";
  +                    ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}";
                   } else if (this.getMin() >= 0 && this.getMax() < 0) {
  -                    ret = this.child.toString()+"{"+this.getMin()+",}";
  +                    ret = this.child.toString(options)+"{"+this.getMin()+",}";
                   } else
                       throw new RuntimeException("Token#toString(): CLOSURE "
                                                  +this.getMin()+", "+this.getMax());
               } else {
                   if (this.getMin() < 0 && this.getMax() < 0) {
  -                    ret = this.child.toString()+"*?";
  +                    ret = this.child.toString(options)+"*?";
                   } else if (this.getMin() == this.getMax()) {
  -                    ret = this.child.toString()+"{"+this.getMin()+"}?";
  +                    ret = this.child.toString(options)+"{"+this.getMin()+"}?";
                   } else if (this.getMin() >= 0 && this.getMax() >= 0) {
  -                    ret = this.child.toString()+"{"+this.getMin()+","+this.getMax()+"}?";
  +                    ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}?";
                   } else if (this.getMin() >= 0 && this.getMax() < 0) {
  -                    ret = this.child.toString()+"{"+this.getMin()+",}?";
  +                    ret = this.child.toString(options)+"{"+this.getMin()+",}?";
                   } else
                       throw new RuntimeException("Token#toString(): NONGREEDYCLOSURE "
                                                  +this.getMin()+", "+this.getMax());
  @@ -1175,31 +1193,31 @@
               return this.parennumber;
           }
   
  -        public String toString() {
  +        public String toString(int options) {
               String ret = null;
               switch (this.type) {
                 case PAREN:
                   if (this.parennumber == 0) {
  -                    ret = "(?:"+this.child.toString()+")";
  +                    ret = "(?:"+this.child.toString(options)+")";
                   } else {
  -                    ret = "("+this.child.toString()+")";
  +                    ret = "("+this.child.toString(options)+")";
                   }
                   break;
   
                 case LOOKAHEAD:
  -                ret = "(?="+this.child.toString()+")";
  +                ret = "(?="+this.child.toString(options)+")";
                   break;
                 case NEGATIVELOOKAHEAD:
  -                ret = "(?!"+this.child.toString()+")";
  +                ret = "(?!"+this.child.toString(options)+")";
                   break;
                 case LOOKBEHIND:
  -                ret = "(?<="+this.child.toString()+")";
  +                ret = "(?<="+this.child.toString(options)+")";
                   break;
                 case NEGATIVELOOKBEHIND:
  -                ret = "(?<!"+this.child.toString()+")";
  +                ret = "(?<!"+this.child.toString(options)+")";
                   break;
                 case INDEPENDENT:
  -                ret = "(?>"+this.child.toString()+")";
  +                ret = "(?>"+this.child.toString(options)+")";
                   break;
               }
               return ret;
  @@ -1230,7 +1248,7 @@
               throw new RuntimeException("Internal Error: "+index);
           }
   
  -        public String toString() {
  +        public String toString(int options) {
               String ret;
               if (refNumber > 0) {
                   ret = "(?("+refNumber+")";
  @@ -1278,12 +1296,12 @@
               return this.mask;
           }
   
  -        public String toString() {
  +        public String toString(int options) {
               return "(?"
                   +(this.add == 0 ? "" : REUtil.createOptionString(this.add))
                   +(this.mask == 0 ? "" : REUtil.createOptionString(this.mask))
                   +":"
  -                +this.child.toString()
  +                +this.child.toString(options)
                   +")";
           }
       }
  @@ -1352,7 +1370,7 @@
                   buffer.append(tok.getString());
               }
   
  -            ((StringToken)previous).string = buffer.toString();
  +            ((StringToken)previous).string = new String(buffer);
           }
   
           int size() {
  @@ -1362,40 +1380,40 @@
               return (Token)this.children.elementAt(index);
           }
   
  -        public String toString() {
  +        public String toString(int options) {
               String ret;
               if (this.type == CONCAT) {
                   if (this.children.size() == 2) {
                       Token ch = this.getChild(0);
                       Token ch2 = this.getChild(1);
                       if (ch2.type == CLOSURE && ch2.getChild(0) == ch) {
  -                        ret = ch.toString()+"+";
  +                        ret = ch.toString(options)+"+";
                       } else if (ch2.type == NONGREEDYCLOSURE && ch2.getChild(0) == ch) {
  -                        ret = ch.toString()+"+?";
  +                        ret = ch.toString(options)+"+?";
                       } else
  -                        ret = ch.toString()+ch2.toString();
  +                        ret = ch.toString(options)+ch2.toString(options);
                   } else {
                       StringBuffer sb = new StringBuffer();
                       for (int i = 0;  i < this.children.size();  i ++) {
  -                        sb.append(this.children.elementAt(i).toString());
  +                        sb.append(((Token)this.children.elementAt(i)).toString(options));
                       }
  -                    ret = sb.toString();
  +                    ret = new String(sb);
                   }
                   return ret;
               }
               if (this.children.size() == 2 && this.getChild(1).type == EMPTY) {
  -                ret = this.getChild(0).toString()+"?";
  +                ret = this.getChild(0).toString(options)+"?";
               } else if (this.children.size() == 2
                          && this.getChild(0).type == EMPTY) {
  -                ret = this.getChild(1).toString()+"??";
  +                ret = this.getChild(1).toString(options)+"??";
               } else {
                   StringBuffer sb = new StringBuffer();
  -                sb.append(this.children.elementAt(0).toString());
  +                sb.append(((Token)this.children.elementAt(0)).toString(options));
                   for (int i = 1;  i < this.children.size();  i ++) {
                       sb.append((char)'|');
  -                    sb.append(this.children.elementAt(i).toString());
  +                    sb.append(((Token)this.children.elementAt(i)).toString(options));
                   }
  -                ret = sb.toString();
  +                ret = new String(sb);
               }
               return ret;
           }