You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oro-dev@jakarta.apache.org by Takashi Okamoto <to...@rd.nttdata.co.jp> on 2000/11/10 20:26:54 UTC

[PATCH] for unicode problem over 0xff characters

Hello, jakarta-oro developers.

I made a patch for a Unicode problem in Perl5Compiler.java and
Perl5Matcher.java.
jakarta-oro currently has the following problems:

[Problem 1]

 Perl5Util perl = new Perl5Util();
 boolean result = perl.match("m![a-z]+!", "abcdX");


 Here 'X' stands for a Unicode character above 0xff.

 This match throws a fatal exception!

[Problem 2]

 Perl5Util perl = new Perl5Util();
 boolean result = perl.match("m![X-Y]+!", "ABCDEF");


 Here 'X' and 'Y' stand for Unicode characters above 0xff.
 'ABCDEF' likewise stands for Unicode characters that fall between 'X' and 'Y'.
 This match returns false! (The correct result is true.)


These problems no longer occur after applying my patch to
Perl5Compiler.java and Perl5Matcher.java.

This patch may not be the best approach, because I don't know the
jakarta-oro code very well.

But I hope the next jakarta-oro release resolves these Unicode problems.


Regards.


PS.
This patch is against the CVS source as of 2000/11/10.

install memo

[1] download jakarta-oro from CVS
[2] cd jakarta-oro/src/java/org/apache/oro/text/regexp
[3] patch -p1 < [the patch at the end of this mail]
[4] build jakarta-oro
------------------------
Takashi Okamoto



------- patch for Perl5Compiler.java and Perl5Matcher.java -------

*** Perl5Compiler.java.org Fri Nov 10 09:55:15 2000
--- Perl5Compiler.java Fri Nov 10 09:09:34 2000
***************
*** 925,934 ****
    private void __setCharacterClassBits(char[] bits, int offset, char deflt,
             char ch)
    {
!     if(__program== null || ch >= 256)
        return;
-     ch &= 0xffff;

      if(deflt == 0) {
        bits[offset + (ch >> 4)] |= (1 << (ch & 0xf));
      } else {
--- 925,935 ----
    private void __setCharacterClassBits(char[] bits, int offset, char deflt,
             char ch)
    {
!     if(__program == null)
        return;

+     __extendProgramSize( offset + (ch >> 4) );
+     ch &= 0xffff;
      if(deflt == 0) {
        bits[offset + (ch >> 4)] |= (1 << (ch & 0xf));
      } else {
***************
*** 936,942 ****
      }
    }

!
    private int __parseCharacterClass() throws MalformedPatternException {
      boolean range = false, skipTest;
      char clss, deflt, lastclss = Character.MAX_VALUE;
--- 937,949 ----
      }
    }

!   private void __extendProgramSize (int  max)
!   {
!       if( max > __programSize ) {
!    __programSize = max + 1;
!       }
!   }
!
    private int __parseCharacterClass() throws MalformedPatternException {
      boolean range = false, skipTest;
      char clss, deflt, lastclss = Character.MAX_VALUE;
***************
*** 1468,1475 ****
      if(__programSize >= Character.MAX_VALUE - 1)
        throw new MalformedPatternException("Expression is too large.");


-     __program= new char[__programSize];
      regexp = new Perl5Pattern();

      regexp._program    = __program;
--- 1475,1486 ----
      if(__programSize >= Character.MAX_VALUE - 1)
        throw new MalformedPatternException("Expression is too large.");

+     __program = new char[Character.MAX_VALUE >> 4];
+
+     for(int i = 0 ;i < Character.MAX_VALUE >> 4 ;i++){
+        __program[i] = Character.MAX_VALUE;
+     }

      regexp = new Perl5Pattern();

      regexp._program    = __program;
*** Perl5Matcher.java.org Fri Nov 10 09:55:37 2000
--- Perl5Matcher.java Fri Nov 10 09:29:51 2000
***************
*** 412,418 ****
     while(__currentOffset < endOffset) {
       ch = __input[__currentOffset];

!      if(ch < 256 &&
          (__program[offset + (ch >> 4)] & (1 << (ch & 0xf))) == 0) {
         if(tmp && __tryExpression(expression, __currentOffset)) {
    success = true;
--- 412,418 ----
     while(__currentOffset < endOffset) {
       ch = __input[__currentOffset];

!      if(
          (__program[offset + (ch >> 4)] & (1 << (ch & 0xf))) == 0) {
         if(tmp && __tryExpression(expression, __currentOffset)) {
    success = true;
***************
*** 655,661 ****
        break;

      case OpCode._ANYOF:
!       if(scan < eol && (ch = __input[scan]) < 256) {
   while((__program[operand + (ch >> 4)] & (1 << (ch & 0xf))) == 0) {
     if(++scan < eol)
       ch = __input[scan];
--- 655,662 ----
        break;

      case OpCode._ANYOF:
!       if(scan < eol ) {
!       ch = __input[scan];
   while((__program[operand + (ch >> 4)] & (1 << (ch & 0xf))) == 0) {
     if(++scan < eol)
       ch = __input[scan];
***************
*** 805,811 ****
   if(nextChar == __EOS && inputRemains)
     nextChar = __input[input];

!  if(nextChar >= 256 || (__program[current + (nextChar >> 4)] &
       (1 << (nextChar & 0xf))) != 0)
     return false;

--- 806,812 ----
   if(nextChar == __EOS && inputRemains)
     nextChar = __input[input];

!  if((__program[current + (nextChar >> 4)] &
       (1 << (nextChar & 0xf))) != 0)
     return false;