You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oro-dev@jakarta.apache.org by Takashi Okamoto <to...@rd.nttdata.co.jp> on 2001/01/09 02:30:10 UTC
[PATCH] for unicode problem (Re: New Regular Expression Package: JavaRegex2 )
From: "Daniel F. Savarese" <df...@savarese.org>
To: <or...@jakarta.apache.org>
Sent: Tuesday, January 02, 2001 3:32 PM
Subject: Re: New Regular Expression Package: JavaRegex2
> However, other than the known unicode character class deficiency
> (for which there is a non-optimal, but working patch; and fixing it the
> "right" way is just a matter of someone with more time than myself to
> take the time to do it),
I posted a Unicode patch earlier, but it had a weakness: it could
exhaust memory. This new patch does not have that problem.
The main points of this patch are:
Add new operators _ANYOFUN and _NANYOFUN to OpCode.java.
Add a new method __parseUnicodeClass() to Perl5Compiler.
Add a new method __matchUnicodeClass() to Perl5Matcher.
If a Unicode character is included between '[' and ']', the expression
will be parsed by __parseUnicodeClass().
If no Unicode character is included, the expression will be parsed by
__parseCharacterClass().
This approach keeps good performance (both speed and memory).
(But it took me a lot of time :( )
What do you think of this patch, Daniel?
regards.
-------
Takashi Okamoto
------------------------
diff -crN orig/src/java/org/apache/oro/text/regex/OpCode.java
src/java/org/apache/oro/text/regex/OpCode.java
*** orig/src/java/org/apache/oro/text/regex/OpCode.java Sun Jan 7 14:10:40
2001
--- src/java/org/apache/oro/text/regex/OpCode.java Mon Jan 8 04:35:15 2001
***************
*** 107,125 ****
_IFMATCH = 31, // no Succeeds if the following matches.
_UNLESSM = 32, // no Fails if the following matches.
_SUCCEED = 33, // no Return from a subroutine, basically.
! _WHILEM = 34; // no Do curly processing and see if rest
matches.
// Lengths of the various operands.
static final int _operandLength[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0,
! 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0
};
static final char _opType[] = {
_END, _BOL, _BOL, _BOL, _EOL, _EOL, _EOL, _ANY, _ANY, _ANYOF, _CURLY,
_CURLY, _BRANCH, _BACK, _EXACTLY, _NOTHING, _STAR, _PLUS, _ALNUM,
_NALNUM, _BOUND, _NBOUND, _SPACE, _NSPACE, _DIGIT, _NDIGIT, _REF,
! _OPEN, _CLOSE, _MINMOD, _BOL, _BRANCH, _BRANCH, _END, _WHILEM
};
static final char _opLengthVaries[] = {
--- 107,129 ----
_IFMATCH = 31, // no Succeeds if the following matches.
_UNLESSM = 32, // no Fails if the following matches.
_SUCCEED = 33, // no Return from a subroutine, basically.
! _WHILEM = 34, // no Do curly processing and see if rest
matches.
! _ANYOFUN = 35, // yes Match unicode character in this class.
! _NANYOFUN= 36, // yes Match unicode character not in this
class.
! _RANGE = 1; // yes Range flag in
// Lengths of the various operands.
static final int _operandLength[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0,
! 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
};
static final char _opType[] = {
_END, _BOL, _BOL, _BOL, _EOL, _EOL, _EOL, _ANY, _ANY, _ANYOF, _CURLY,
_CURLY, _BRANCH, _BACK, _EXACTLY, _NOTHING, _STAR, _PLUS, _ALNUM,
_NALNUM, _BOUND, _NBOUND, _SPACE, _NSPACE, _DIGIT, _NDIGIT, _REF,
! _OPEN, _CLOSE, _MINMOD, _BOL, _BRANCH, _BRANCH, _END, _WHILEM,
! _ANYOFUN, _NANYOFUN
};
static final char _opLengthVaries[] = {
***************
*** 127,133 ****
};
static final char _opLengthOne[] = {
! _ANY, _SANY, _ANYOF, _ALNUM, _NALNUM, _SPACE, _NSPACE, _DIGIT, _NDIGIT
};
static final int _NULL_OFFSET = -1;
--- 131,138 ----
};
static final char _opLengthOne[] = {
! _ANY, _SANY, _ANYOF, _ALNUM, _NALNUM, _SPACE, _NSPACE, _DIGIT,
_NDIGIT,
! _ANYOFUN, _NANYOFUN
};
static final int _NULL_OFFSET = -1;
diff -crN orig/src/java/org/apache/oro/text/regex/Perl5Compiler.java
src/java/org/apache/oro/text/regex/Perl5Compiler.java
*** orig/src/java/org/apache/oro/text/regex/Perl5Compiler.java Sun Jan 7
14:10:40 2001
--- src/java/org/apache/oro/text/regex/Perl5Compiler.java Mon Jan 8
04:35:15 2001
***************
*** 574,580 ****
case '[':
__input._increment();
! offset = __parseCharacterClass();
retFlags[0] |= (__NONNULL | __SIMPLE);
break tryAgain;
--- 574,598 ----
case '[':
__input._increment();
!
! int tmpoffset = __input._getOffset();
! int length = __input._getLength();
! char ch;
! boolean isunicode = false;
!
! /* check for unicode between '[' and ']' */
! while( (tmpoffset < length) &&
! ((ch = __input._getValue(tmpoffset++)) != ']')) {
! if((ch & (char)0xff00) != 0){
! isunicode = true;
! break;
! }
! }
!
! if(isunicode)
! offset = __parseUnicodeClass();
! else
! offset = __parseCharacterClass();
retFlags[0] |= (__NONNULL | __SIMPLE);
break tryAgain;
***************
*** 799,804 ****
--- 817,827 ----
ender = (char)__parseHex(__input._array, ++pOffset, 2, numLength);
pOffset+=numLength[0];
break;
+ case 'u':
+ numLength = new int[1];
+ ender = (char)__parseHex(__input._array, ++pOffset, 4, numLength);
+ pOffset+=numLength[0];
+ break;
case 'c':
++pOffset;
ender = __input._getValue(pOffset++);
***************
*** 936,942 ****
}
}
-
private int __parseCharacterClass() throws MalformedPatternException {
boolean range = false, skipTest;
char clss, deflt, lastclss = Character.MAX_VALUE;
--- 959,964 ----
***************
*** 1034,1039 ****
--- 1056,1066 ----
numLength);
__input._increment(numLength[0]);
break;
+ case 'u':
+ clss = (char)__parseHex(__input._array, __input._getOffset(), 4,
+ numLength);
+ __input._increment(numLength[0]);
+ break;
case 'c':
clss = __input._postIncrement();
if(Character.isLowerCase(clss))
***************
*** 1084,1089 ****
--- 1111,1267 ----
__getNextChar();
+ return offset;
+ }
+
+ private int __parseUnicodeClass() throws MalformedPatternException { /*
+  * Parses the character class at the current input position and emits it
+  * as an _ANYOFUN node (or _NANYOFUN when the class starts with '^').
+  * The node is followed by the class members: a single member is emitted
+  * as one code, a range is emitted as _RANGE, low, high, and the list is
+  * closed with _END. Returns the offset produced by __emitNode. Throws
+  * MalformedPatternException for a reversed range or a missing ']'.
+  */
+ boolean range = false, skipTest;
+ char clss, lastclss = Character.MAX_VALUE;
+ int offset, numLength[] = { 0 };
+
+ if(__input._getValue() == '^') {
+ offset = __emitNode(OpCode._NANYOFUN);
+ __input._increment();
+ } else {
+ offset = __emitNode(OpCode._ANYOFUN);
+ }
+
+ clss = __input._getValue();
+
+ if(clss == ']' || clss == '-') // a leading ']' or '-' is taken as a literal member
+ skipTest = true;
+ else
+ skipTest = false;
+
+ while((!__input._isAtEnd() && (clss = __input._getValue()) != ']')
+ || skipTest) { // skipTest forces one pass even when clss is ']'
+ // It sucks, but we have to make this assignment every time
+ skipTest = false;
+ __input._increment();
+ if(clss == '\\') {
+ clss = __input._postIncrement(); // character following the backslash
+
+ switch(clss){
+ case 'w': // class escapes are stored as their opcode value
+ clss = OpCode._ALNUM;
+ lastclss = Character.MAX_VALUE; // makes the lastclss > clss check below fail if this ends a range
+ break;
+ case 'W':
+ clss = OpCode._NALNUM;
+ lastclss = Character.MAX_VALUE;
+ break;
+ case 's':
+ clss = OpCode._SPACE;
+ lastclss = Character.MAX_VALUE;
+ break;
+ case 'S':
+ clss = OpCode._NSPACE;
+ lastclss = Character.MAX_VALUE;
+ break;
+ case 'd':
+ clss = OpCode._DIGIT;
+ lastclss = Character.MAX_VALUE;
+ break;
+ case 'D':
+ clss = OpCode._NDIGIT;
+ lastclss = Character.MAX_VALUE;
+ break;
+ case 'n':
+ clss = '\n';
+ break;
+ case 'r':
+ clss = '\r';
+ break;
+ case 't':
+ clss = '\t';
+ break;
+ case 'f':
+ clss = '\f';
+ break;
+ case 'b':
+ clss = '\b';
+ break;
+ case 'e':
+ clss = '\033';
+ break;
+ case 'a':
+ clss = '\007';
+ break;
+ case 'x': // presumably up to 2 hex digits; numLength[0] is used as the count consumed - confirm against __parseHex
+ clss = (char)__parseHex(__input._array, __input._getOffset(), 2,
+ numLength);
+ __input._increment(numLength[0]);
+ break;
+ case 'u': // same as 'x' but with a limit of 4 passed to __parseHex
+ clss = (char)__parseHex(__input._array, __input._getOffset(), 4,
+ numLength);
+ __input._increment(numLength[0]);
+ break;
+ case 'c': // \cX: upper-case X, then flip bit 6
+ clss = __input._postIncrement();
+ if(Character.isLowerCase(clss))
+ clss = Character.toUpperCase(clss);
+ clss ^= 64;
+ break;
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ clss = (char)__parseOctal(__input._array, __input._getOffset() - 1, // restart at the digit already consumed
+ 3, numLength);
+ __input._increment(numLength[0] - 1); // that first digit was already skipped
+ break;
+ }
+ }
+
+ if(range) { // clss is the upper end of a pending range
+ if(lastclss > clss)
+ throw new MalformedPatternException(
+ "Invalid [] range in expression.");
+ range = false;
+ } else {
+ lastclss = clss;
+
+ if(__input._getValue() == '-' && // a '-' not directly before ']' starts a range
+ __input._getOffset() + 1 < __input._getLength() &&
+ __input._getValueRelative(1) != ']') {
+ __input._increment();
+ range = true;
+ continue; // read the upper end on the next pass
+ }
+ }
+
+ if(lastclss == clss) { // single member (or a range whose ends are equal)
+ __emitCode(clss);
+ if((__modifierFlags[0] & __CASE_INSENSITIVE) != 0 &&
+ Character.isUpperCase(clss) && Character.isUpperCase(lastclss)){
+ __programSize--; // NOTE(review): looks like this steps back so the lower-case code replaces the one just emitted - confirm against __emitCode
+ __emitCode(Character.toLowerCase(clss));
+ }
+ }
+ if(lastclss < clss) { // proper range: marker, low end, high end
+ __emitCode(OpCode._RANGE);
+ __emitCode(lastclss);
+ __emitCode(clss);
+
+ if((__modifierFlags[0] & __CASE_INSENSITIVE) != 0 &&
+ Character.isUpperCase(clss) && Character.isUpperCase(lastclss)){
+ __programSize-=2; // NOTE(review): same step-back as above, over both range ends - confirm
+ __emitCode(Character.toLowerCase(lastclss));
+ __emitCode(Character.toLowerCase(clss));
+
+
+ }
+ lastclss = Character.MAX_VALUE; // overwritten by the assignment after this block
+ range = false;
+ }
+
+ lastclss = clss;
+ }
+
+ if(__input._getValue() != ']') // the loop stopped without reaching the closing bracket
+ throw new MalformedPatternException("Unmatched [] in expression.");
+
+ __getNextChar();
+ __emitCode(OpCode._END); // terminates the member list
return offset;
}
diff -crN orig/src/java/org/apache/oro/text/regex/Perl5Debug.java
src/java/org/apache/oro/text/regex/Perl5Debug.java
*** orig/src/java/org/apache/oro/text/regex/Perl5Debug.java Sun Jan 7
14:10:40 2001
--- src/java/org/apache/oro/text/regex/Perl5Debug.java Mon Jan 8 04:35:15
2001
***************
*** 121,126 ****
--- 121,128 ----
if(operator == OpCode._ANYOF) {
offset += 16;
+ } else if( operator == OpCode._ANYOFUN || operator ==
OpCode._NANYOFUN ) {
+ offset+=(prog[offset-1]-2);
} else if(operator == OpCode._EXACTLY) {
++offset;
buffer.append(" <");
***************
*** 176,181 ****
--- 178,185 ----
case OpCode._ANY : str = "ANY"; break;
case OpCode._SANY : str = "SANY"; break;
case OpCode._ANYOF : str = "ANYOF"; break;
+ case OpCode._ANYOFUN : str = "ANYOFUN"; break;
+ case OpCode._NANYOFUN : str = "NANYOFUN"; break;
/*
case OpCode._ANYOF : // debug
buffer.append("ANYOF\n\n");
diff -crN orig/src/java/org/apache/oro/text/regex/Perl5Matcher.java
src/java/org/apache/oro/text/regex/Perl5Matcher.java
*** orig/src/java/org/apache/oro/text/regex/Perl5Matcher.java Sun Jan 7
14:10:40 2001
--- src/java/org/apache/oro/text/regex/Perl5Matcher.java Mon Jan 8 04:35:15
2001
***************
*** 398,403 ****
--- 398,404 ----
if((offset = expression._startClassOffset) != OpCode._NULL_OFFSET) {
boolean doEvery, tmp;
+ char op;
doEvery = ((expression._anchor & Perl5Pattern._OPT_SKIP) == 0);
***************
*** 406,412 ****
endOffset -= dontTry;
tmp = true;
! switch(__program[offset]) {
case OpCode._ANYOF:
offset = OpCode._getOperand(offset);
while(__currentOffset < endOffset) {
--- 407,413 ----
endOffset -= dontTry;
tmp = true;
! switch(op = __program[offset]) {
case OpCode._ANYOF:
offset = OpCode._getOperand(offset);
while(__currentOffset < endOffset) {
***************
*** 426,431 ****
--- 427,451 ----
break;
+ case OpCode._ANYOFUN:
+ case OpCode._NANYOFUN:
+ offset = OpCode._getOperand(offset);
+ while(__currentOffset < endOffset) {
+ ch = __input[__currentOffset];
+
+ if(__matchUnicodeClass(ch, __program, offset, op)) {
+ if(tmp && __tryExpression(expression, __currentOffset)) {
+ success = true;
+ break _mainLoop;
+ } else
+ tmp = doEvery;
+ } else
+ tmp = true;
+ ++__currentOffset;
+ }
+
+ break;
+
case OpCode._BOUND:
if(minLength > 0) {
++dontTry;
***************
*** 599,606 ****
--- 619,672 ----
return success;
}
+
+ /**
+  * Tests a character against the member list of an _ANYOFUN or
+  * _NANYOFUN node.
+  * <p>
+  * The list starting at <code>offset</code> is scanned until an
+  * OpCode._END entry. An entry is either OpCode._RANGE followed by the
+  * low and high ends of a range, one of the class opcodes (_ALNUM,
+  * _NALNUM, _SPACE, _NSPACE, _DIGIT, _NDIGIT), or a literal character.
+  * <p>
+  * NOTE(review): literal characters whose value equals _END, _RANGE or
+  * one of the six class opcodes cannot be told apart from those markers
+  * in this encoding; that would need a change in the compiler as well.
+  *
+  * @param code      the input character to test
+  * @param __program the compiled program holding the member list
+  * @param offset    index of the first entry of the member list
+  * @param opcode    OpCode._ANYOFUN for a normal class; any other value
+  *                  is treated as the negated class
+  * @return for _ANYOFUN, true if some entry matches <code>code</code>;
+  *         for the negated form, true if no entry matches
+  */
+ private boolean __matchUnicodeClass(char code, char __program[],
+                                     int offset, char opcode)
+ {
+   boolean isANYOF = (opcode == OpCode._ANYOFUN);
+
+   while(__program[offset] != OpCode._END) {
+     char item = __program[offset++];
+
+     if(item == OpCode._RANGE) {
+       // offset now points at the low end; the high end follows it.
+       if(code >= __program[offset] && code <= __program[offset + 1])
+         return isANYOF;
+       offset += 2;
+     } else if(item < 0x20) {
+       switch(item) {
+       case OpCode._ALNUM:
+         if(OpCode._isWordCharacter(code)) return isANYOF;
+         break;
+       case OpCode._NALNUM:
+         if(!OpCode._isWordCharacter(code)) return isANYOF;
+         break;
+       case OpCode._SPACE:
+         if(Character.isWhitespace(code)) return isANYOF;
+         break;
+       case OpCode._NSPACE:
+         if(!Character.isWhitespace(code)) return isANYOF;
+         break;
+       case OpCode._DIGIT:
+         if(code >= '0' && code <= '9') return isANYOF;
+         break;
+       case OpCode._NDIGIT:
+         // Exact complement of _DIGIT: '0' and '9' are digits, so the
+         // bounds must be exclusive here.
+         if(code < '0' || code > '9') return isANYOF;
+         break;
+       default:
+         // Any other value below 0x20 is a literal control character
+         // (for example '\t' or '\n') and must be compared directly;
+         // without this branch such members could never match.
+         if(code == item) return isANYOF;
+         break;
+       }
+     } else if(code == item) {
+       return isANYOF;
+     }
+   }
+   return !isANYOF;
+ }
+
private boolean __tryExpression(Perl5Pattern expression, int offset) {
int count;
***************
*** 628,633 ****
--- 694,700 ----
private int __repeat(int offset, int max) {
int scan, eol, operand, ret;
char ch;
+ char op;
scan = __inputOffset;
eol = __eol;
***************
*** 637,643 ****
operand = OpCode._getOperand(offset);
! switch(__program[offset]) {
case OpCode._ANY:
while(scan < eol && __input[scan] != '\n')
--- 704,710 ----
operand = OpCode._getOperand(offset);
! switch(op = __program[offset]) {
case OpCode._ANY:
while(scan < eol && __input[scan] != '\n')
***************
*** 656,662 ****
case OpCode._ANYOF:
if(scan < eol && (ch = __input[scan]) < 256) {
! while((__program[operand + (ch >> 4)] & (1 << (ch & 0xf))) == 0) {
if(++scan < eol)
ch = __input[scan];
else
--- 723,742 ----
case OpCode._ANYOF:
if(scan < eol && (ch = __input[scan]) < 256) {
! while((ch < 256 ) && (__program[operand + (ch >> 4)] & (1 << (ch &
0xf))) == 0) {
! if(++scan < eol)
! ch = __input[scan];
! else
! break;
! }
! }
! break;
!
! case OpCode._ANYOFUN:
! case OpCode._NANYOFUN:
! if(scan < eol) {
! ch = __input[scan];
! while(__matchUnicodeClass(ch, __program, operand, op)){
if(++scan < eol)
ch = __input[scan];
else
***************
*** 807,812 ****
--- 887,909 ----
if(nextChar >= 256 || (__program[current + (nextChar >> 4)] &
(1 << (nextChar & 0xf))) != 0)
+ return false;
+
+ if(!inputRemains && input >= __eol)
+ return false;
+
+ inputRemains = (++input < __endOffset);
+ nextChar = (inputRemains ? __input[input] : __EOS);
+ break;
+
+ case OpCode._ANYOFUN:
+ case OpCode._NANYOFUN:
+ current = OpCode._getOperand(scan);
+
+ if(nextChar == __EOS && inputRemains)
+ nextChar = __input[input];
+
+ if(!__matchUnicodeClass(nextChar, __program, current, op))
return false;
if(!inputRemains && input >= __eol)