You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oro-dev@jakarta.apache.org by df...@apache.org on 2001/01/29 01:19:01 UTC
cvs commit: jakarta-oro/src/java/org/apache/oro/text/regex OpCode.java Perl5Compiler.java Perl5Debug.java Perl5Matcher.java

dfs         01/01/28 16:19:01

  Modified:    .        CHANGES TODO
               build    build-oro.xml
               src/java/org/apache/oro/text/regex OpCode.java
                        Perl5Compiler.java Perl5Debug.java
                        Perl5Matcher.java
  Added:       .        CONTRIBUTORS
  Log:
  o Applied a modified version of Takashi Okamoto's Unicode and POSIX character
    class patch to OpCode, Perl5Compiler, Perl5Debug, and Perl5Matcher.
  
  o Removed Unicode from the TODO and added improve/optimize Unicode classes.
  
  o Added a CONTRIBUTORS file to keep track of those who have contributed code
    to the project.
  
  o Incremented release to 2.0.2-dev-2
  
  Revision  Changes    Path
  1.3       +13 -2     jakarta-oro/CHANGES
  
  Index: CHANGES
  ===================================================================
  RCS file: /home/cvs/jakarta-oro/CHANGES,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- CHANGES	2000/12/24 03:34:23	1.2
  +++ CHANGES	2001/01/29 00:19:00	1.3
  @@ -1,6 +1,17 @@
  -$Id: CHANGES,v 1.2 2000/12/24 03:34:23 dfs Exp $
  +$Id: CHANGES,v 1.3 2001/01/29 00:19:00 dfs Exp $
   
  -Version 2.0.2-dev
  +Version 2.0.2-dev-2
  +
  +o Applied a modified version of Takashi Okamoto's unicode/posix patch.
  +  It adds unicode support to character classes and adds partial support
  +  for posix classes (it supports things like [:digit:] and [:print:], but
  +  not [:^digit:] and [:^print:]).  It will be improved/optimized later, but
  +  gives people the functionality they need today.
  +
  +Version 2.0.2-dev-1
  +
  +o Removed commented out code and changed OpCode._isWordCharacter() to
  +  use Character.isLetterOrDigit()
   
   o Some documentation fixes.
   
  
  
  
  1.2       +2 -3      jakarta-oro/TODO
  
  Index: TODO
  ===================================================================
  RCS file: /home/cvs/jakarta-oro/TODO,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- TODO	2000/07/23 23:08:28	1.1
  +++ TODO	2001/01/29 00:19:00	1.2
  @@ -1,7 +1,6 @@
  -$Id: TODO,v 1.1 2000/07/23 23:08:28 jon Exp $
  +$Id: TODO,v 1.2 2001/01/29 00:19:00 dfs Exp $
   
  -o Make Perl5 character classes (e.g., [abcde...]) fully support Unicode
  -  input.  Currently character classes only match 8-bit characters.
  +o Optimize/improve Unicode character classes.
   
   o Fix any pending bugs listed in BUGS file.
   
  
  
  
  1.1                  jakarta-oro/CONTRIBUTORS
  
  Index: CONTRIBUTORS
  ===================================================================
  $Id: CONTRIBUTORS,v 1.1 2001/01/29 00:19:00 dfs Exp $
  
  Daniel Savarese <df...@savarese.org> is the original author of the
  OROMatcher, PerlTools, AwkTools, and TextTools packages that became
  the Jakarta-ORO project.
  
  Takashi Okamoto <to...@rd.nttdata.co.jp> has contributed a unicode
  character class fix and an initial posix character class implementation.
  
  
  
  1.9       +2 -2      jakarta-oro/build/build-oro.xml
  
  Index: build-oro.xml
  ===================================================================
  RCS file: /home/cvs/jakarta-oro/build/build-oro.xml,v
  retrieving revision 1.8
  retrieving revision 1.9
  diff -u -r1.8 -r1.9
  --- build-oro.xml	2000/12/24 03:34:23	1.8
  +++ build-oro.xml	2001/01/29 00:19:00	1.9
  @@ -1,5 +1,5 @@
   <?xml version="1.0"?>
  -<!-- $Id: build-oro.xml,v 1.8 2000/12/24 03:34:23 dfs Exp $ -->
  +<!-- $Id: build-oro.xml,v 1.9 2001/01/29 00:19:00 dfs Exp $ -->
   
   <project name="Jakarta-ORO" default="main" basedir=".">
   
  @@ -8,7 +8,7 @@
   <target name="init">
     <property name="Name" value="Jakarta-ORO"/>
     <property name="year" value="2000"/>
  -  <property name="version" value="2.0.2-dev-1"/>
  +  <property name="version" value="2.0.2-dev-2"/>
     <property name="project" value="jakarta-oro"/>
     <property name="build.compiler" value="classic"/>
     <property name="code.src" value="../src"/>
  
  
  
  1.4       +25 -5     jakarta-oro/src/java/org/apache/oro/text/regex/OpCode.java
  
  Index: OpCode.java
  ===================================================================
  RCS file: /home/cvs/jakarta-oro/src/java/org/apache/oro/text/regex/OpCode.java,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- OpCode.java	2001/01/28 22:21:57	1.3
  +++ OpCode.java	2001/01/29 00:19:00	1.4
  @@ -63,7 +63,7 @@
    * op-codes used in a compiled regular expression.
   
    @author <a href="mailto:dfs@savarese.org">Daniel F. Savarese</a>
  - @version $Id: OpCode.java,v 1.3 2001/01/28 22:21:57 dfs Exp $
  + @version $Id: OpCode.java,v 1.4 2001/01/29 00:19:00 dfs Exp $
    */
   final class OpCode {
   
  @@ -107,19 +107,36 @@
        _IFMATCH = 31,  // no       Succeeds if the following matches.
        _UNLESSM = 32,  // no       Fails if the following matches.
        _SUCCEED = 33,  // no       Return from a subroutine, basically.
  -     _WHILEM  = 34;  // no       Do curly processing and see if rest matches.
  +     _WHILEM  = 34,  // no       Do curly processing and see if rest matches.
  +     _ANYOFUN = 35,  // yes      Match unicode character in this class.
  +     _NANYOFUN= 36,  // yes      Match unicode character not in this class.
  +     _RANGE   = 37,  // yes      Range flag in a unicode character class.
  +    // Change the names of these constants later to make it clear they
  +    // are POSIX classes.
  +     _ALPHA   = 38,
  +     _BLANK   = 39,
  +     _CNTRL   = 40,
  +     _GRAPH   = 41,
  +     _LOWER   = 42,
  +     _PRINT   = 43,
  +     _PUNCT   = 44,
  +     _UPPER   = 45,
  +     _XDIGIT  = 46,
  +     _OPCODE  = 47,
  +     _ONECHAR = 48;
   
     // Lengths of the various operands.
     static final int _operandLength[] = {
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0,
  -    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0
  +    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
     };
   
     static final char _opType[] = {
   	_END, _BOL, _BOL, _BOL, _EOL, _EOL, _EOL, _ANY, _ANY, _ANYOF, _CURLY,
   	_CURLY, _BRANCH, _BACK, _EXACTLY, _NOTHING, _STAR, _PLUS, _ALNUM,
   	_NALNUM, _BOUND, _NBOUND, _SPACE, _NSPACE, _DIGIT, _NDIGIT, _REF,
  -	_OPEN, _CLOSE, _MINMOD,	_BOL, _BRANCH, _BRANCH, _END, _WHILEM
  +	_OPEN, _CLOSE, _MINMOD,	_BOL, _BRANCH, _BRANCH, _END, _WHILEM,
  +	_ANYOFUN, _NANYOFUN
     };
   
     static final char _opLengthVaries[] = {
  @@ -127,7 +144,8 @@
     };
   
     static final char _opLengthOne[] = {
  -    _ANY, _SANY, _ANYOF, _ALNUM, _NALNUM, _SPACE, _NSPACE, _DIGIT, _NDIGIT
  +    _ANY, _SANY, _ANYOF, _ALNUM, _NALNUM, _SPACE, _NSPACE, _DIGIT, _NDIGIT, 
  +    _ANYOFUN, _NANYOFUN
     };
   
     static final int  _NULL_OFFSET  = -1;
  @@ -181,5 +199,7 @@
     // Matches Perl's definition of \w, which is different from [:alnum:]
     static final boolean _isWordCharacter(char token) {
       return (Character.isLetterOrDigit(token) || token == '_');
  -  }        
  +  }
   }
  +
  +
  
  
  
  1.6       +236 -1    jakarta-oro/src/java/org/apache/oro/text/regex/Perl5Compiler.java
  
  Index: Perl5Compiler.java
  ===================================================================
  RCS file: /home/cvs/jakarta-oro/src/java/org/apache/oro/text/regex/Perl5Compiler.java,v
  retrieving revision 1.5
  retrieving revision 1.6
  diff -u -r1.5 -r1.6
  --- Perl5Compiler.java	2001/01/28 22:21:57	1.5
  +++ Perl5Compiler.java	2001/01/29 00:19:01	1.6
  @@ -57,6 +57,8 @@
    * by Daniel F. Savarese. We appreciate his contributions.
    */
   
  +import java.util.Hashtable;
  +
   /**
    * The Perl5Compiler class is used to create compiled regular expressions
    * conforming to the Perl5 regular expression syntax.  It generates
  @@ -65,7 +67,7 @@
    * information about Perl5 regular expressions.
   
    @author <a href="mailto:dfs@savarese.org">Daniel F. Savarese</a>
  - @version $Id: Perl5Compiler.java,v 1.5 2001/01/28 22:21:57 dfs Exp $
  + @version $Id: Perl5Compiler.java,v 1.6 2001/01/29 00:19:01 dfs Exp $
   
    * @see PatternCompiler
    * @see MalformedPatternException
  @@ -103,6 +105,26 @@
     // keeps track of the current offset.
     private char[] __program;
   
  +  /**
  +   * Lookup table for POSIX character class names.  Keys are the names
  +   * that appear between "[:" and ":]"; values are the Character code
  +   * handed back by __parsePOSIX().  "alnum", "digit" and "space" map to
  +   * the letters 'w', 'd' and 's' so that the character class parser
  +   * handles them exactly like the \w, \d and \s escapes.
  +   * Declared as Hashtable, the java.util class imported by this file.
  +   */
  +  private static final Hashtable __hashPOSIX;
  +  
  +  static {
  +    __hashPOSIX = new Hashtable();
  +    __hashPOSIX.put("alnum",     new Character('w'));
  +    __hashPOSIX.put("alpha",     new Character(OpCode._ALPHA));
  +    __hashPOSIX.put("blank",     new Character(OpCode._BLANK));
  +    __hashPOSIX.put("cntrl",     new Character(OpCode._CNTRL));
  +    __hashPOSIX.put("digit",     new Character('d'));
  +    __hashPOSIX.put("graph",     new Character(OpCode._GRAPH));
  +    __hashPOSIX.put("lower",     new Character(OpCode._LOWER));
  +    __hashPOSIX.put("print",     new Character(OpCode._PRINT));
  +    __hashPOSIX.put("punct",     new Character(OpCode._PUNCT));
  +    __hashPOSIX.put("space",     new Character('s'));
  +    __hashPOSIX.put("upper",     new Character(OpCode._UPPER));
  +    __hashPOSIX.put("xdigit",    new Character(OpCode._XDIGIT));
  +  }
  +
  +
     /**
      * The default mask for the {@link #compile compile} methods.
      * It is equal to 0.
  @@ -567,7 +589,7 @@
   
         case '[':
   	__input._increment();
  -	offset = __parseCharacterClass();
  +	offset = __parseUnicodeClass();
   	retFlags[0] |= (__NONNULL | __SIMPLE);
   	break tryAgain;
   
  @@ -1078,6 +1100,219 @@
       __getNextChar();
   
       return offset;
  +  }
  +
  +
  +  private int __parseUnicodeClass() throws MalformedPatternException {
  +    boolean range = false, skipTest;
  +    char clss, lastclss = Character.MAX_VALUE;
  +    int offset, numLength[] = { 0 };
  +    boolean opcodeFlag; /* true when clss holds a class op-code rather than a literal character */
  +
  +    if(__input._getValue() == '^') {
  +      offset = __emitNode(OpCode._NANYOFUN);
  +      __input._increment();
  +    } else {
  +      offset = __emitNode(OpCode._ANYOFUN);
  +    }
  +
  +    clss = __input._getValue();
  +
  +    if(clss == ']' || clss == '-')
  +      skipTest = true;
  +    else
  +      skipTest = false;
  +
  +    while((!__input._isAtEnd() && (clss = __input._getValue()) != ']')
  +	  || skipTest) {
  +      // skipTest only forces the first pass (so a leading ']' is processed as a class member); clear it on every pass
  +      skipTest = false;
  +      opcodeFlag = false;
  +      __input._increment();
  +
  +      if(clss == '\\' || clss == '[') {
  +	if(clss == '\\') {
  +	  /* backslash escape: the character after the backslash is the one interpreted by the switch below */
  +	  clss = __input._postIncrement();
  +	} else {
  +	  /* possible POSIX class such as [:digit:]; __parsePOSIX() returns 0 when it is not one, leaving clss as '[' */
  +	  char posixOpCode = __parsePOSIX();
  +	  if(posixOpCode != 0){
  +	    opcodeFlag = true;
  +	    clss = posixOpCode;
  +	  }
  +	}
  +
  +	switch(clss){
  +	case 'w':
  +	  opcodeFlag = true;
  +	  clss = OpCode._ALNUM;
  +	  lastclss = Character.MAX_VALUE;
  +	  break;
  +	case 'W':
  +	  opcodeFlag = true;
  +	  clss = OpCode._NALNUM;
  +	  lastclss = Character.MAX_VALUE;
  +	  break;
  +	case 's':
  +	  opcodeFlag = true;
  +	  clss = OpCode._SPACE;
  +	  lastclss = Character.MAX_VALUE;
  +	  break;
  +	case 'S':
  +	  opcodeFlag = true;
  +	  clss = OpCode._NSPACE;
  +	  lastclss = Character.MAX_VALUE;
  +	  break;
  +	case 'd':
  +	  opcodeFlag = true;
  +	  clss = OpCode._DIGIT;
  +	  lastclss = Character.MAX_VALUE;
  +	  break;
  +	case 'D':
  +	  opcodeFlag = true;
  +	  clss = OpCode._NDIGIT;
  +	  lastclss = Character.MAX_VALUE;
  +	  break;
  +	case 'n':
  +	  clss = '\n';
  +	  break;
  +	case 'r':
  +	  clss = '\r';
  +	  break;
  +	case 't':
  +	  clss = '\t';
  +	  break;
  +	case 'f':
  +	  clss = '\f';
  +	  break;
  +	case 'b':
  +	  clss = '\b';
  +	  break;
  +	case 'e':
  +	  clss = '\033';
  +	  break;
  +	case 'a':
  +	  clss = '\007';
  +	  break;
  +	case 'x':
  +	  clss = (char)__parseHex(__input._array, __input._getOffset(), 2,
  +				  numLength);
  +	  __input._increment(numLength[0]);
  +	  break;
  +	case 'c':
  +	  clss = __input._postIncrement();
  +	  if(Character.isLowerCase(clss))
  +	    clss = Character.toUpperCase(clss);
  +	  clss ^= 64;
  +	  break;
  +	case '0': case '1': case '2': case '3': case '4':
  +	case '5': case '6': case '7': case '8': case '9':
  +	  clss = (char)__parseOctal(__input._array, __input._getOffset() - 1,
  +				    3, numLength);
  +	  __input._increment(numLength[0] - 1);
  +	  break;
  +	default:
  +	}
  +      }
  +
  +      if(range) {
  +	if(lastclss > clss)
  +	  throw new MalformedPatternException(
  +			 "Invalid [] range in expression.");
  +	range = false;
  +      } else {
  +	lastclss = clss;
  +
  +	if(__input._getValue() == '-' &&
  +	   __input._getOffset() + 1 < __input._getLength() &&
  +	   __input._getValueRelative(1) != ']') {
  +	  __input._increment();
  +	  range = true;
  +	  continue;
  +	}
  +      }
  +
  +    if(lastclss == clss) {
  +      if(opcodeFlag == true) {
  +	__emitCode(OpCode._OPCODE);
  +      } else {
  +	__emitCode(OpCode._ONECHAR);
  +      }
  +      __emitCode(clss);
  +
  +      if((__modifierFlags[0] & __CASE_INSENSITIVE) != 0 &&
  +	 Character.isUpperCase(clss) && Character.isUpperCase(lastclss)){
  +	__programSize--;
  +	__emitCode(Character.toLowerCase(clss));
  +      }
  +    }
  +    if(lastclss < clss) {
  +      __emitCode(OpCode._RANGE);
  +      __emitCode(lastclss);
  +      __emitCode(clss);
  +
  +      if((__modifierFlags[0] & __CASE_INSENSITIVE) != 0 &&
  +	 Character.isUpperCase(clss) && Character.isUpperCase(lastclss)){
  +	__programSize-=2;
  +	__emitCode(Character.toLowerCase(lastclss));
  +	__emitCode(Character.toLowerCase(clss));
  +
  +
  +      }
  +      lastclss = Character.MAX_VALUE;
  +      range = false;
  +    }
  +    
  +      lastclss = clss;
  +    }
  +
  +    if(__input._getValue() != ']')
  +      throw new MalformedPatternException("Unmatched [] in expression.");
  +
  +    __getNextChar();
  +    __emitCode(OpCode._END);
  +    return offset;
  +  }
  +
  +
  +  /**
  +   * Parses a POSIX class expression like [:foo:], starting at the current input offset, which must hold the ':' after '['.
  +   * On success the input offset is moved past the closing ":]"; on failure it is left unchanged.
  +   * @return the code registered for the class name in __hashPOSIX, or 0 when the text is not a known POSIX class.
  +   */
  +  private char __parsePOSIX() throws MalformedPatternException {
  +    int offset = __input._getOffset();
  +    int len = __input._getLength();
  +    int pos = offset;
  +    char value = __input._getValue(pos++);
  +    StringBuffer buf;
  +    Object opcode;
  +
  +    if( value != ':' ) return 0;
  +
  +    buf = new StringBuffer();
  +    
  +    try { 
  +      while ( (value = __input._getValue(pos++)) != ':' && pos < len) {
  +	buf.append(value);	
  +      }
  +    } catch (Exception e){
  +      return 0;
  +    }
  +
  +    if( __input._getValue(pos++) != ']'){
  +      return 0;
  +    }
  +
  +    opcode = __hashPOSIX.get(buf.toString());
  +
  +    if( opcode == null )
  +      return 0;
  +
  +    __input._setOffset(pos);
  +
  +    return ((Character)opcode).charValue();
     }
   
   
  
  
  
  1.3       +7 -3      jakarta-oro/src/java/org/apache/oro/text/regex/Perl5Debug.java
  
  Index: Perl5Debug.java
  ===================================================================
  RCS file: /home/cvs/jakarta-oro/src/java/org/apache/oro/text/regex/Perl5Debug.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- Perl5Debug.java	2000/07/23 23:25:26	1.2
  +++ Perl5Debug.java	2001/01/29 00:19:01	1.3
  @@ -68,7 +68,7 @@
    * comparison with the program generated by Perl5 with the -r option.
   
    @author <a href="mailto:dfs@savarese.org">Daniel F. Savarese</a>
  - @version $Id: Perl5Debug.java,v 1.2 2000/07/23 23:25:26 jon Exp $
  + @version $Id: Perl5Debug.java,v 1.3 2001/01/29 00:19:01 dfs Exp $
   
    * @see Perl5Pattern
    */
  @@ -121,9 +121,11 @@
   
         if(operator == OpCode._ANYOF) {
   	offset += 16;
  +      } else if(operator == OpCode._ANYOFUN || operator == OpCode._NANYOFUN) {
  +	offset+=(prog[offset-1]-2);
         } else if(operator == OpCode._EXACTLY) {
  -	++offset;
  -	buffer.append(" <");
  +	  ++offset;
  +	  buffer.append(" <");
   
   	//while(prog[offset] != '0')
   	while(prog[offset] != CharStringPointer._END_OF_STRING) {
  @@ -176,6 +178,8 @@
       case OpCode._ANY   : str = "ANY"; break;
       case OpCode._SANY  : str = "SANY"; break;
       case OpCode._ANYOF : str = "ANYOF"; break;
  +    case OpCode._ANYOFUN : str = "ANYOFUN"; break;
  +    case OpCode._NANYOFUN : str = "NANYOFUN"; break;
         /*
       case OpCode._ANYOF : // debug
         buffer.append("ANYOF\n\n");
  
  
  
  1.8       +140 -26   jakarta-oro/src/java/org/apache/oro/text/regex/Perl5Matcher.java
  
  Index: Perl5Matcher.java
  ===================================================================
  RCS file: /home/cvs/jakarta-oro/src/java/org/apache/oro/text/regex/Perl5Matcher.java,v
  retrieving revision 1.7
  retrieving revision 1.8
  diff -u -r1.7 -r1.8
  --- Perl5Matcher.java	2001/01/28 22:21:57	1.7
  +++ Perl5Matcher.java	2001/01/29 00:19:01	1.8
  @@ -66,7 +66,7 @@
    * Perl5Compiler.
   
    @author <a href="mailto:dfs@savarese.org">Daniel F. Savarese</a>
  - @version $Id: Perl5Matcher.java,v 1.7 2001/01/28 22:21:57 dfs Exp $
  + @version $Id: Perl5Matcher.java,v 1.8 2001/01/29 00:19:01 dfs Exp $
   
    * @see PatternMatcher
    * @see Perl5Compiler
  @@ -402,6 +402,7 @@
   
         if((offset = expression._startClassOffset) != OpCode._NULL_OFFSET) {
   	boolean doEvery, tmp;
  +	char op;
   
   	doEvery = ((expression._anchor & Perl5Pattern._OPT_SKIP) == 0);
   
  @@ -410,7 +411,7 @@
   	endOffset -= dontTry;
   	tmp = true;
   
  -	switch(__program[offset]) {
  +	switch(op = __program[offset]) {
   	case OpCode._ANYOF:
   	  offset = OpCode._getOperand(offset);
   	  while(__currentOffset < endOffset) {
  @@ -430,6 +431,25 @@
   
   	  break;
   
  +	case OpCode._ANYOFUN:
  +	case OpCode._NANYOFUN:
  +	  offset = OpCode._getOperand(offset);
  +	  while(__currentOffset < endOffset) {
  +	    ch = __input[__currentOffset];
  +
  +	    if(__matchUnicodeClass(ch, __program, offset, op)) {
  +	      if(tmp && __tryExpression(expression, __currentOffset)) {
  +		success = true;
  +		break _mainLoop;
  +	      } else
  +		tmp = doEvery;
  +	    } else
  +	      tmp = true;
  +	    ++__currentOffset;
  +	  }
  +
  +	  break;
  +
   	case OpCode._BOUND:
   	  if(minLength > 0) {
   	    ++dontTry;
  @@ -603,11 +623,94 @@
   
       return success;
     }
  +  
  +  /**
  +   * Tests a character against the compiled body of an _ANYOFUN or
  +   * _NANYOFUN character class.  The body is a sequence of entries, each
  +   * introduced by a marker: _RANGE followed by the low and high bounds,
  +   * _OPCODE followed by a class op-code, or _ONECHAR followed by a single
  +   * character.  The sequence is terminated by _END.
  +   *
  +   * @param code  The input character to test.
  +   * @param __program  The compiled program containing the class body.
  +   * @param offset  Index in __program of the first entry of the class body.
  +   * @param opcode  _ANYOFUN for a normal class; any other value is
  +   *                treated as a negated class.
  +   * @return For _ANYOFUN, true if some entry matches code.  Otherwise,
  +   *         true only if no entry matches code.
  +   */
  +  private boolean __matchUnicodeClass(char code, char __program[], 
  +			     int offset ,char opcode)
  +  {
  +    boolean isANYOF = ( opcode == OpCode._ANYOFUN );
   
  +    while( __program[offset] != OpCode._END ){
  +      if( __program[offset] == OpCode._RANGE ){
  +	offset++;
  +	if((code >= __program[offset]) && (code <= __program[offset+1])){
  +	  return isANYOF;
  +	} else {
  +	  offset+=2;
  +	}
   
  +      } else if( __program[offset] == OpCode._OPCODE ){
  +	offset++;
  +	switch ( __program[offset++] ) {
  +	case OpCode._ALNUM:
  +	  if(OpCode._isWordCharacter(code)) return isANYOF;
  +	  break;
  +	case OpCode._NALNUM:
  +	  if(!OpCode._isWordCharacter(code)) return isANYOF;
  +	  break;
  +	case OpCode._SPACE:
  +	  if(Character.isWhitespace(code)) return isANYOF;
  +	  break;
  +	case OpCode._NSPACE:
  +	  if(!Character.isWhitespace(code)) return isANYOF;
  +	  break;
  +	case OpCode._DIGIT:
  +	  if(Character.isDigit(code)) return isANYOF;
  +	  break;
  +	case OpCode._NDIGIT:
  +	  if(!Character.isDigit(code)) return isANYOF;
  +	  break;
  +	case OpCode._ALPHA:
  +	  // The compiler emits _ALPHA for [:alpha:]; without this case
  +	  // that class could never match any character.
  +	  if(Character.isLetter(code)) return isANYOF;
  +	  break;
  +	case OpCode._BLANK:
  +	  if(Character.isSpaceChar(code)) return isANYOF;
  +	  break;
  +	case OpCode._CNTRL:
  +	  if(Character.isISOControl(code)) return isANYOF;
  +	  break;
  +	case OpCode._LOWER:
  +	  if(Character.isLowerCase(code)) return isANYOF;
  +	  break;
  +	case OpCode._UPPER:
  +	  if(Character.isUpperCase(code)) return isANYOF;
  +	  break;
  +	case OpCode._PRINT:
  +	  if(Character.isSpaceChar(code)) return isANYOF;
  +          // Fall through to check if the character is alphanumeric,
  +	  // or a punctuation mark.  Printable characters are either
  +	  // alphanumeric, punctuation marks, or spaces.
  +	case OpCode._GRAPH:
  +	  if(Character.isLetterOrDigit(code))
  +	     return isANYOF;
  +          // Fall through to check if the character is a punctuation mark.
  +          // Graph characters are either alphanumeric or punctuation.
  +	case OpCode._PUNCT:
  +	  switch ( Character.getType(code) ) {
  +	    case Character.DASH_PUNCTUATION:
  +	    case Character.START_PUNCTUATION:
  +	    case Character.END_PUNCTUATION:
  +	    case Character.CONNECTOR_PUNCTUATION:
  +	    case Character.OTHER_PUNCTUATION:
  +	      return isANYOF;
  +	    default:
  +	      break;
  +	    }
  +	  break;
  +	case OpCode._XDIGIT:
  +	  if( (code >= '0' && code <= '9') ||
  +	      (code >= 'a' && code <= 'f') ||
  +	      (code >= 'A' && code <= 'F')) return isANYOF;
  +	  break;
  +	}
  +      } else if((__program[offset++] == OpCode._ONECHAR) &&
  +		(__program[offset++] == code))
  +	{
  +	  // Both increments above always run for a _ONECHAR entry, so a
  +	  // non-matching character is skipped along with its marker.
  +	  return isANYOF;
  +	}
  +    }
  +    return !isANYOF;
  +  }
  +  
     private boolean __tryExpression(Perl5Pattern expression, int offset) {
       int count;
  -
  +    
       __inputOffset = offset;
       __lastParen   = 0;
       __expSize     = 0;
  @@ -632,6 +735,7 @@
     private int __repeat(int offset, int max) {
       int scan, eol, operand, ret;
       char ch;
  +    char op;
   
       scan = __inputOffset;
       eol  = __eol;
  @@ -641,7 +745,7 @@
   
       operand = OpCode._getOperand(offset);
   
  -    switch(__program[offset]) {
  +    switch(op = __program[offset]) {
   
       case OpCode._ANY:
         while(scan < eol && __input[scan] != '\n')
  @@ -660,7 +764,20 @@
   
       case OpCode._ANYOF:
         if(scan < eol && (ch = __input[scan]) < 256) {
  -	while((__program[operand + (ch >> 4)] & (1 << (ch & 0xf))) == 0) {
  +	while((ch < 256  ) && (__program[operand + (ch >> 4)] & (1 << (ch & 0xf))) == 0) {
  +	  if(++scan < eol)
  +	    ch = __input[scan];
  +	  else
  +	    break;
  +	}
  +      }
  +      break;
  +
  +    case OpCode._ANYOFUN:
  +    case OpCode._NANYOFUN:
  +      if(scan < eol) {
  +	ch = __input[scan];
  +	while(__matchUnicodeClass(ch, __program, operand, op)){
   	  if(++scan < eol)
   	    ch = __input[scan];
   	  else
  @@ -820,6 +937,23 @@
   	nextChar = (inputRemains ? __input[input] : __EOS);
   	break;
   
  +      case OpCode._ANYOFUN:
  +      case OpCode._NANYOFUN:
  +	current = OpCode._getOperand(scan);
  +
  +	if(nextChar == __EOS && inputRemains)
  +	  nextChar = __input[input];
  +
  +	if(!__matchUnicodeClass(nextChar, __program, current, op))
  +	  return false;
  +
  +	if(!inputRemains && input >= __eol)
  +	  return false;
  +
  +	inputRemains = (++input < __endOffset);
  +	nextChar = (inputRemains ? __input[input] : __EOS);
  +	break;
  +
         case OpCode._ALNUM:
   	if(!inputRemains)
   	  return false;
  @@ -1389,13 +1523,7 @@
       __originalInput = input;
       if(expression._isCaseInsensitive)
         input = _toLower(input);
  -    /*    
  -    if(__interpret(expression, input, 0, input.length)) {
  -      if(__lastMatchResult.beginOffset(0) == 0 &&
  -	 __lastMatchResult.endOffset(0) == input.length)
  -	return true;
  -    }
  -    */
  +
       __initInterpreterGlobals(expression, input, 0, input.length, 0);
       __lastSuccess = (__tryExpression(expression, 0) &&
   		     __endMatchOffsets[0] == input.length);
  @@ -1549,20 +1677,6 @@
      *         Perl5Pattern is passed as the pattern parameter.
      */
     public boolean contains(String input, Pattern pattern) {
  -    /*
  -    char[] inp;
  -    Perl5Pattern expression;
  -
  -    expression = (Perl5Pattern)pattern;
  -
  -    __originalInput = inp = input.toCharArray();
  -
  -    if(expression._isCaseInsensitive)
  -      //_toLower(inp, false);
  -      inp = _toLower(inp, false);
  -
  -    return __interpret(expression, inp, 0, inp.length);
  -    */
       return contains(input.toCharArray(), pattern);
     }