You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oro-dev@jakarta.apache.org by Takashi Okamoto <to...@rd.nttdata.co.jp> on 2001/01/09 02:30:10 UTC

[PATCH] for unicode problem (Re: New Regular Expression Package: JavaRegex2 )

From: "Daniel F. Savarese" <df...@savarese.org>
To: <or...@jakarta.apache.org>
Sent: Tuesday, January 02, 2001 3:32 PM
Subject: Re: New Regular Expression Package: JavaRegex2

>  However, other than the known unicode character class deficiency
> (for which there is a non-optimal, but working patch; and fixing it the
> "right" way is just a matter of someone with more time than myself to
> take the time to do it),

I previously posted a unicode patch, but it had a weakness: it could
exhaust memory. This new patch does not have that problem.
The main points of this patch are:

 Add new operators _ANYOFUN and _NANYOFUN to OpCode.java.
 Add a new method __parseUnicodeClass() to Perl5Compiler.
 Add a new method __matchUnicodeClass() to Perl5Matcher.

 If a unicode character (one whose code is above 0xFF) appears between
 '[' and ']', the character class is parsed by __parseUnicodeClass().
 If no such character appears, the character class is parsed by
 __parseCharacterClass() as before.

This approach keeps good performance (in both speed and memory).
(But it took me a lot of time :( )
What do you think of this patch, Daniel?

regards.
-------
Takashi Okamoto

------------------------
diff -crN orig/src/java/org/apache/oro/text/regex/OpCode.java
src/java/org/apache/oro/text/regex/OpCode.java
*** orig/src/java/org/apache/oro/text/regex/OpCode.java Sun Jan  7 14:10:40
2001
--- src/java/org/apache/oro/text/regex/OpCode.java Mon Jan  8 04:35:15 2001
***************
*** 107,125 ****
       _IFMATCH = 31,  // no       Succeeds if the following matches.
       _UNLESSM = 32,  // no       Fails if the following matches.
       _SUCCEED = 33,  // no       Return from a subroutine, basically.
!      _WHILEM  = 34;  // no       Do curly processing and see if rest
matches.

    // Lengths of the various operands.
    static final int _operandLength[] = {
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0,
!     0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0
    };

    static final char _opType[] = {
   _END, _BOL, _BOL, _BOL, _EOL, _EOL, _EOL, _ANY, _ANY, _ANYOF, _CURLY,
   _CURLY, _BRANCH, _BACK, _EXACTLY, _NOTHING, _STAR, _PLUS, _ALNUM,
   _NALNUM, _BOUND, _NBOUND, _SPACE, _NSPACE, _DIGIT, _NDIGIT, _REF,
!  _OPEN, _CLOSE, _MINMOD, _BOL, _BRANCH, _BRANCH, _END, _WHILEM
    };

    static final char _opLengthVaries[] = {
--- 107,129 ----
       _IFMATCH = 31,  // no       Succeeds if the following matches.
       _UNLESSM = 32,  // no       Fails if the following matches.
       _SUCCEED = 33,  // no       Return from a subroutine, basically.
!      _WHILEM  = 34,  // no       Do curly processing and see if rest
matches.
!      _ANYOFUN = 35,  // yes      Match unicode character in this class.
!      _NANYOFUN= 36,  // yes      Match unicode character not in this
class.
!      _RANGE   = 1;   // yes      Range flag in

    // Lengths of the various operands.
    static final int _operandLength[] = {
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0,
!     0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
    };

    static final char _opType[] = {
   _END, _BOL, _BOL, _BOL, _EOL, _EOL, _EOL, _ANY, _ANY, _ANYOF, _CURLY,
   _CURLY, _BRANCH, _BACK, _EXACTLY, _NOTHING, _STAR, _PLUS, _ALNUM,
   _NALNUM, _BOUND, _NBOUND, _SPACE, _NSPACE, _DIGIT, _NDIGIT, _REF,
!  _OPEN, _CLOSE, _MINMOD, _BOL, _BRANCH, _BRANCH, _END, _WHILEM,
!  _ANYOFUN, _NANYOFUN
    };

    static final char _opLengthVaries[] = {
***************
*** 127,133 ****
    };

    static final char _opLengthOne[] = {
!     _ANY, _SANY, _ANYOF, _ALNUM, _NALNUM, _SPACE, _NSPACE, _DIGIT, _NDIGIT
    };

    static final int  _NULL_OFFSET  = -1;
--- 131,138 ----
    };

    static final char _opLengthOne[] = {
!     _ANY, _SANY, _ANYOF, _ALNUM, _NALNUM, _SPACE, _NSPACE, _DIGIT,
_NDIGIT,
!     _ANYOFUN, _NANYOFUN
    };

    static final int  _NULL_OFFSET  = -1;
diff -crN orig/src/java/org/apache/oro/text/regex/Perl5Compiler.java
src/java/org/apache/oro/text/regex/Perl5Compiler.java
*** orig/src/java/org/apache/oro/text/regex/Perl5Compiler.java Sun Jan  7
14:10:40 2001
--- src/java/org/apache/oro/text/regex/Perl5Compiler.java Mon Jan  8
04:35:15 2001
***************
*** 574,580 ****

        case '[':
   __input._increment();
!  offset = __parseCharacterClass();
   retFlags[0] |= (__NONNULL | __SIMPLE);
   break tryAgain;

--- 574,598 ----

        case '[':
   __input._increment();
!
!         int tmpoffset = __input._getOffset();
!  int length =  __input._getLength();
!  char ch;
!  boolean isunicode = false;
!
!         /* check uincode between '[' and ']' */
!  while( (tmpoffset < length) &&
!         ((ch = __input._getValue(tmpoffset++)) != ']')) {
!           if((ch & (char)0xff00) != 0){
!      isunicode = true;
!      break;
!           }
!         }
!
!         if(isunicode)
!           offset = __parseUnicodeClass();
!         else
!    offset = __parseCharacterClass();
   retFlags[0] |= (__NONNULL | __SIMPLE);
   break tryAgain;

***************
*** 799,804 ****
--- 817,827 ----
       ender = (char)__parseHex(__input._array, ++pOffset, 2, numLength);
       pOffset+=numLength[0];
       break;
+    case 'u':
+      numLength = new int[1];
+      ender = (char)__parseHex(__input._array, ++pOffset, 4, numLength);
+      pOffset+=numLength[0];
+      break;
     case 'c':
       ++pOffset;
       ender = __input._getValue(pOffset++);
***************
*** 936,942 ****
      }
    }

-
    private int __parseCharacterClass() throws MalformedPatternException {
      boolean range = false, skipTest;
      char clss, deflt, lastclss = Character.MAX_VALUE;
--- 959,964 ----
***************
*** 1034,1039 ****
--- 1056,1066 ----
        numLength);
     __input._increment(numLength[0]);
     break;
+  case 'u':
+    clss = (char)__parseHex(__input._array, __input._getOffset(), 4,
+       numLength);
+    __input._increment(numLength[0]);
+    break;
   case 'c':
     clss = __input._postIncrement();
     if(Character.isLowerCase(clss))
***************
*** 1084,1089 ****
--- 1111,1267 ----

      __getNextChar();

+     return offset;
+   }
+
+   private int __parseUnicodeClass() throws MalformedPatternException {
+     boolean range = false, skipTest;
+     char clss, lastclss = Character.MAX_VALUE;
+     int offset, numLength[] = { 0 };
+
+     if(__input._getValue() == '^') {
+       offset = __emitNode(OpCode._NANYOFUN);
+       __input._increment();
+     } else {
+       offset = __emitNode(OpCode._ANYOFUN);
+     }
+
+     clss = __input._getValue();
+
+     if(clss == ']' || clss == '-')
+       skipTest = true;
+     else
+       skipTest = false;
+
+     while((!__input._isAtEnd() && (clss = __input._getValue()) != ']')
+    || skipTest) {
+       // It sucks, but we have to make this assignment every time
+       skipTest = false;
+       __input._increment();
+       if(clss == '\\') {
+  clss = __input._postIncrement();
+
+  switch(clss){
+  case 'w':
+    clss = OpCode._ALNUM;
+    lastclss = Character.MAX_VALUE;
+    break;
+  case 'W':
+    clss = OpCode._NALNUM;
+    lastclss = Character.MAX_VALUE;
+    break;
+  case 's':
+    clss = OpCode._SPACE;
+    lastclss = Character.MAX_VALUE;
+    break;
+  case 'S':
+    clss = OpCode._NSPACE;
+    lastclss = Character.MAX_VALUE;
+    break;
+  case 'd':
+    clss = OpCode._DIGIT;
+    lastclss = Character.MAX_VALUE;
+    break;
+  case 'D':
+    clss = OpCode._NDIGIT;
+    lastclss = Character.MAX_VALUE;
+    break;
+  case 'n':
+    clss = '\n';
+    break;
+  case 'r':
+    clss = '\r';
+    break;
+  case 't':
+    clss = '\t';
+    break;
+  case 'f':
+    clss = '\f';
+    break;
+  case 'b':
+    clss = '\b';
+    break;
+  case 'e':
+    clss = '\033';
+    break;
+  case 'a':
+    clss = '\007';
+    break;
+  case 'x':
+    clss = (char)__parseHex(__input._array, __input._getOffset(), 2,
+       numLength);
+    __input._increment(numLength[0]);
+    break;
+  case 'u':
+    clss = (char)__parseHex(__input._array, __input._getOffset(), 4,
+       numLength);
+    __input._increment(numLength[0]);
+    break;
+  case 'c':
+    clss = __input._postIncrement();
+    if(Character.isLowerCase(clss))
+      clss = Character.toUpperCase(clss);
+    clss ^= 64;
+    break;
+  case '0': case '1': case '2': case '3': case '4':
+  case '5': case '6': case '7': case '8': case '9':
+    clss = (char)__parseOctal(__input._array, __input._getOffset() - 1,
+         3, numLength);
+    __input._increment(numLength[0] - 1);
+    break;
+  }
+       }
+
+       if(range) {
+  if(lastclss > clss)
+    throw new MalformedPatternException(
+     "Invalid [] range in expression.");
+  range = false;
+       } else {
+  lastclss = clss;
+
+  if(__input._getValue() == '-' &&
+     __input._getOffset() + 1 < __input._getLength() &&
+     __input._getValueRelative(1) != ']') {
+    __input._increment();
+    range = true;
+    continue;
+  }
+       }
+
+     if(lastclss == clss) {
+       __emitCode(clss);
+       if((__modifierFlags[0] & __CASE_INSENSITIVE) != 0 &&
+   Character.isUpperCase(clss) && Character.isUpperCase(lastclss)){
+  __programSize--;
+  __emitCode(Character.toLowerCase(clss));
+       }
+     }
+     if(lastclss < clss) {
+       __emitCode(OpCode._RANGE);
+       __emitCode(lastclss);
+       __emitCode(clss);
+
+       if((__modifierFlags[0] & __CASE_INSENSITIVE) != 0 &&
+   Character.isUpperCase(clss) && Character.isUpperCase(lastclss)){
+  __programSize-=2;
+  __emitCode(Character.toLowerCase(lastclss));
+  __emitCode(Character.toLowerCase(clss));
+
+
+       }
+       lastclss = Character.MAX_VALUE;
+       range = false;
+     }
+
+       lastclss = clss;
+     }
+
+     if(__input._getValue() != ']')
+       throw new MalformedPatternException("Unmatched [] in expression.");
+
+     __getNextChar();
+     __emitCode(OpCode._END);
      return offset;
    }

diff -crN orig/src/java/org/apache/oro/text/regex/Perl5Debug.java
src/java/org/apache/oro/text/regex/Perl5Debug.java
*** orig/src/java/org/apache/oro/text/regex/Perl5Debug.java Sun Jan  7
14:10:40 2001
--- src/java/org/apache/oro/text/regex/Perl5Debug.java Mon Jan  8 04:35:15
2001
***************
*** 121,126 ****
--- 121,128 ----

        if(operator == OpCode._ANYOF) {
   offset += 16;
+       } else if( operator == OpCode._ANYOFUN || operator ==
OpCode._NANYOFUN ) {
+  offset+=(prog[offset-1]-2);
        } else if(operator == OpCode._EXACTLY) {
   ++offset;
   buffer.append(" <");
***************
*** 176,181 ****
--- 178,185 ----
      case OpCode._ANY   : str = "ANY"; break;
      case OpCode._SANY  : str = "SANY"; break;
      case OpCode._ANYOF : str = "ANYOF"; break;
+     case OpCode._ANYOFUN : str = "ANYOFUN"; break;
+     case OpCode._NANYOFUN : str = "NANYOFUN"; break;
        /*
      case OpCode._ANYOF : // debug
        buffer.append("ANYOF\n\n");
diff -crN orig/src/java/org/apache/oro/text/regex/Perl5Matcher.java
src/java/org/apache/oro/text/regex/Perl5Matcher.java
*** orig/src/java/org/apache/oro/text/regex/Perl5Matcher.java Sun Jan  7
14:10:40 2001
--- src/java/org/apache/oro/text/regex/Perl5Matcher.java Mon Jan  8 04:35:15
2001
***************
*** 398,403 ****
--- 398,404 ----

        if((offset = expression._startClassOffset) != OpCode._NULL_OFFSET) {
   boolean doEvery, tmp;
+  char op;

   doEvery = ((expression._anchor & Perl5Pattern._OPT_SKIP) == 0);

***************
*** 406,412 ****
   endOffset -= dontTry;
   tmp = true;

!  switch(__program[offset]) {
   case OpCode._ANYOF:
     offset = OpCode._getOperand(offset);
     while(__currentOffset < endOffset) {
--- 407,413 ----
   endOffset -= dontTry;
   tmp = true;

!  switch(op = __program[offset]) {
   case OpCode._ANYOF:
     offset = OpCode._getOperand(offset);
     while(__currentOffset < endOffset) {
***************
*** 426,431 ****
--- 427,451 ----

     break;

+  case OpCode._ANYOFUN:
+  case OpCode._NANYOFUN:
+    offset = OpCode._getOperand(offset);
+    while(__currentOffset < endOffset) {
+      ch = __input[__currentOffset];
+
+      if(__matchUnicodeClass(ch, __program, offset, op)) {
+        if(tmp && __tryExpression(expression, __currentOffset)) {
+   success = true;
+   break _mainLoop;
+        } else
+   tmp = doEvery;
+      } else
+        tmp = true;
+      ++__currentOffset;
+    }
+
+    break;
+
   case OpCode._BOUND:
     if(minLength > 0) {
       ++dontTry;
***************
*** 599,606 ****
--- 619,672 ----

      return success;
    }
+
+   private boolean __matchUnicodeClass(char code, char __program[],
+         int offset ,char opcode)
+   {
+     boolean isANYOF = ( opcode == OpCode._ANYOFUN );
+
+     while( __program[offset] != OpCode._END ){
+
+       if( __program[offset] == OpCode._RANGE ){
+  offset++;
+  if((code >= __program[offset]) && (code <= __program[offset+1])){
+    return isANYOF;
+  } else {
+    offset+=2;
+  }
+
+       } else if( __program[offset] < 0x20 ){
+
+  switch(__program[offset++]){
+  case OpCode._ALNUM:
+    if(OpCode._isWordCharacter(code))return isANYOF;
+    break;
+  case OpCode._NALNUM:
+    if(!OpCode._isWordCharacter(code))return isANYOF;
+    break;
+  case OpCode._SPACE:
+    if(Character.isWhitespace(code))return isANYOF;
+    break;
+  case OpCode._NSPACE:
+    if(!Character.isWhitespace(code))return isANYOF;
+    break;
+  case OpCode._DIGIT:
+    if((code >= '0') && (code <='9'))return isANYOF;
+    break;
+  case OpCode._NDIGIT:
+    if((code <= '0') || (code >='9'))return isANYOF;
+    break;
+  }
+
+       } else if( code == __program[offset++] ){

+  return isANYOF;

+       }
+     }
+     return !isANYOF;
+   }
+
    private boolean __tryExpression(Perl5Pattern expression, int offset) {
      int count;

***************
*** 628,633 ****
--- 694,700 ----
    private int __repeat(int offset, int max) {
      int scan, eol, operand, ret;
      char ch;
+     char op;

      scan = __inputOffset;
      eol  = __eol;
***************
*** 637,643 ****

      operand = OpCode._getOperand(offset);

!     switch(__program[offset]) {

      case OpCode._ANY:
        while(scan < eol && __input[scan] != '\n')
--- 704,710 ----

      operand = OpCode._getOperand(offset);

!     switch(op = __program[offset]) {

      case OpCode._ANY:
        while(scan < eol && __input[scan] != '\n')
***************
*** 656,662 ****

      case OpCode._ANYOF:
        if(scan < eol && (ch = __input[scan]) < 256) {
!  while((__program[operand + (ch >> 4)] & (1 << (ch & 0xf))) == 0) {
     if(++scan < eol)
       ch = __input[scan];
     else
--- 723,742 ----

      case OpCode._ANYOF:
        if(scan < eol && (ch = __input[scan]) < 256) {
!  while((ch < 256  ) && (__program[operand + (ch >> 4)] & (1 << (ch &
0xf))) == 0) {
!    if(++scan < eol)
!      ch = __input[scan];
!    else
!      break;
!  }
!       }
!       break;
!
!     case OpCode._ANYOFUN:
!     case OpCode._NANYOFUN:
!       if(scan < eol) {
!  ch = __input[scan];
!  while(__matchUnicodeClass(ch, __program, operand, op)){
     if(++scan < eol)
       ch = __input[scan];
     else
***************
*** 807,812 ****
--- 887,909 ----

   if(nextChar >= 256 || (__program[current + (nextChar >> 4)] &
       (1 << (nextChar & 0xf))) != 0)
+    return false;
+
+  if(!inputRemains && input >= __eol)
+    return false;
+
+  inputRemains = (++input < __endOffset);
+  nextChar = (inputRemains ? __input[input] : __EOS);
+  break;
+
+       case OpCode._ANYOFUN:
+       case OpCode._NANYOFUN:
+  current = OpCode._getOperand(scan);
+
+  if(nextChar == __EOS && inputRemains)
+    nextChar = __input[input];
+
+  if(!__matchUnicodeClass(nextChar, __program, current, op))
     return false;

   if(!inputRemains && input >= __eol)