You are viewing a plain text version of this content. The canonical link for it is here.
Posted to regexp-dev@jakarta.apache.org by vg...@apache.org on 2004/03/20 15:35:42 UTC

cvs commit: jakarta-regexp/src/java/org/apache/regexp RE.java RETest.java

vgritsenko    2004/03/20 06:35:42

  Modified:    docs     jakarta-regexp.jar
               src/java/org/apache/regexp RE.java RETest.java
  Log:
  Applied patch from Oleg Sukhodolsky: reduce code duplication, add method for
  character comparison.
  
  Revision  Changes    Path
  1.2       +53 -64    jakarta-regexp/docs/jakarta-regexp.jar
  
  	<<Binary file>>
  
  
  1.22      +121 -143  jakarta-regexp/src/java/org/apache/regexp/RE.java
  
  Index: RE.java
  ===================================================================
  RCS file: /home/cvs/jakarta-regexp/src/java/org/apache/regexp/RE.java,v
  retrieving revision 1.21
  retrieving revision 1.22
  diff -u -r1.21 -r1.22
  --- RE.java	27 Feb 2004 02:41:20 -0000	1.21
  +++ RE.java	20 Mar 2004 14:35:42 -0000	1.22
  @@ -121,14 +121,14 @@
    *    [:cntrl:]            Control characters.
    *    [:digit:]            Numeric characters.
    *    [:graph:]            Characters that are printable and are also visible.
  - *                         (A space is printable, but not visible, while an 
  + *                         (A space is printable, but not visible, while an
    *                         `a' is both.)
    *    [:lower:]            Lower-case alphabetic characters.
  - *    [:print:]            Printable characters (characters that are not 
  + *    [:print:]            Printable characters (characters that are not
    *                         control characters.)
    *    [:punct:]            Punctuation characters (characters that are not letter,
    *                         digits, control characters, or space characters).
  - *    [:space:]            Space characters (such as space, tab, and formfeed, 
  + *    [:space:]            Space characters (such as space, tab, and formfeed,
    *                         to name a few).
    *    [:upper:]            Upper-case alphabetic characters.
    *    [:xdigit:]           Characters that are hexadecimal digits.
  @@ -181,7 +181,7 @@
    *    AB        Matches A followed by B
    *    A|B       Matches either A or B
    *    (A)       Used for subexpression grouping
  - *   (?:A)      Used for subexpression clustering (just like grouping but 
  + *   (?:A)      Used for subexpression clustering (just like grouping but
    *              no backrefs)
    *
    *
  @@ -411,6 +411,7 @@
        * Constructs a regular expression matcher from a String by compiling it
        * using a new instance of RECompiler.  If you will be compiling many
        * expressions, you may prefer to use a single RECompiler object instead.
  +     *
        * @param pattern The regular expression pattern to compile.
        * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
        * @see RECompiler
  @@ -425,6 +426,7 @@
        * Constructs a regular expression matcher from a String by compiling it
        * using a new instance of RECompiler.  If you will be compiling many
        * expressions, you may prefer to use a single RECompiler object instead.
  +     *
        * @param pattern The regular expression pattern to compile.
        * @param matchFlags The matching style
        * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
  @@ -441,15 +443,14 @@
        * Construct a matcher for a pre-compiled regular expression from program
        * (bytecode) data.  Permits special flags to be passed in to modify matching
        * behaviour.
  +     *
        * @param program Compiled regular expression program (see RECompiler and/or recompile)
        * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
        *
        * <pre>
  -     *
        *   MATCH_NORMAL              // Normal (case-sensitive) matching
        *   MATCH_CASEINDEPENDENT     // Case folded comparisons
        *   MATCH_MULTILINE           // Newline matches as BOL/EOL
  -     *
        * </pre>
        *
        * @see RECompiler
  @@ -465,6 +466,7 @@
       /**
        * Construct a matcher for a pre-compiled regular expression from program
        * (bytecode) data.
  +     *
        * @param program Compiled regular expression program
        * @see RECompiler
        * @see recompile
  @@ -485,6 +487,7 @@
   
       /**
        * Converts a 'simplified' regular expression to a full regular expression
  +     *
        * @param pattern The pattern to convert
        * @return The full regular expression
        */
  @@ -527,13 +530,10 @@
        * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
        *
        * <pre>
  -     *
        *   MATCH_NORMAL              // Normal (case-sensitive) matching
        *   MATCH_CASEINDEPENDENT     // Case folded comparisons
        *   MATCH_MULTILINE           // Newline matches as BOL/EOL
  -     *
        * </pre>
  -     *
        */
       public void setMatchFlags(int matchFlags)
       {
  @@ -545,15 +545,12 @@
        * @return Current match behaviour flags (RE.MATCH_*).
        *
        * <pre>
  -     *
        *   MATCH_NORMAL              // Normal (case-sensitive) matching
        *   MATCH_CASEINDEPENDENT     // Case folded comparisons
        *   MATCH_MULTILINE           // Newline matches as BOL/EOL
  -     *
        * </pre>
        *
        * @see #setMatchFlags
  -     *
        */
       public int getMatchFlags()
       {
  @@ -562,6 +559,7 @@
   
       /**
        * Sets the current regular expression program used by this matcher object.
  +     *
        * @param program Regular expression program compiled by RECompiler.
        * @see RECompiler
        * @see REProgram
  @@ -579,6 +577,7 @@
   
       /**
        * Returns the current regular expression program in use by this matcher object.
  +     *
        * @return Regular expression program
        * @see #setProgram
        */
  @@ -589,6 +588,7 @@
   
       /**
        * Returns the number of parenthesized subexpressions available after a successful match.
  +     *
        * @return Number of available parenthesized subexpressions
        */
       public int getParenCount()
  @@ -598,6 +598,7 @@
   
       /**
        * Gets the contents of a parenthesized subexpression after a successful match.
  +     *
        * @param which Nesting level of subexpression
        * @return String
        */
  @@ -613,8 +614,9 @@
   
       /**
        * Returns the start index of a given paren level.
  +     *
        * @param which Nesting level of subexpression
  -     * @return String index 
  +     * @return String index
        */
       public final int getParenStart(int which)
       {
  @@ -624,13 +626,13 @@
               {
                   case 0:
                       return start0;
  -                    
  +
                   case 1:
                       return start1;
  -                    
  +
                   case 2:
                       return start2;
  -                    
  +
                   default:
                       if (startn == null)
                       {
  @@ -644,8 +646,9 @@
   
       /**
        * Returns the end index of a given paren level.
  +     *
        * @param which Nesting level of subexpression
  -     * @return String index 
  +     * @return String index
        */
       public final int getParenEnd(int which)
       {
  @@ -655,13 +658,13 @@
               {
                   case 0:
                       return end0;
  -                    
  +
                   case 1:
                       return end1;
  -                    
  +
                   case 2:
                       return end2;
  -                    
  +
                   default:
                       if (endn == null)
                       {
  @@ -675,6 +678,7 @@
   
       /**
        * Returns the length of a given paren level.
  +     *
        * @param which Nesting level of subexpression
        * @return Number of characters in the parenthesized subexpression
        */
  @@ -689,6 +693,7 @@
   
       /**
        * Sets the start of a paren level
  +     *
        * @param which Which paren level
        * @param i Index in input array
        */
  @@ -701,15 +706,15 @@
                   case 0:
                       start0 = i;
                       break;
  -                    
  +
                   case 1:
                       start1 = i;
                       break;
  -                    
  +
                   case 2:
                       start2 = i;
                       break;
  -                    
  +
                   default:
                       if (startn == null)
                       {
  @@ -723,6 +728,7 @@
   
       /**
        * Sets the end of a paren level
  +     *
        * @param which Which paren level
        * @param i Index in input array
        */
  @@ -735,15 +741,15 @@
                   case 0:
                       end0 = i;
                       break;
  -                    
  +
                   case 1:
                       end1 = i;
                       break;
  -                    
  +
                   case 2:
                       end2 = i;
                       break;
  -                    
  +
                   default:
                       if (endn == null)
                       {
  @@ -759,6 +765,7 @@
        * Throws an Error representing an internal error condition probably resulting
        * from a bug in the regular expression compiler (or possibly data corruption).
        * In practice, this should be very rare.
  +     *
        * @param s Error description
        */
       protected void internalError(String s) throws Error
  @@ -785,10 +792,11 @@
   
       /**
        * Try to match a string against a subset of nodes in the program
  +     *
        * @param firstNode Node to start at in program
  -     * @param lastNode Last valid node (used for matching a subexpression without
  -     * matching the rest of the program as well).
  -     * @param idxStart Starting position in character array
  +     * @param lastNode  Last valid node (used for matching a subexpression without
  +     *                  matching the rest of the program as well).
  +     * @param idxStart  Starting position in character array
        * @return Final input array index if match succeeded.  -1 if not.
        */
       protected int matchNodes(int firstNode, int lastNode, int idxStart)
  @@ -925,26 +933,14 @@
                           }
   
                           // Case fold the backref?
  -                        if ((matchFlags & MATCH_CASEINDEPENDENT) != 0)
  -                        {
  -                            // Compare backref to input, case-folding as we go
  -                            for (int i = 0; i < l; i++)
  -                            {
  -                                if (Character.toLowerCase(search.charAt(idx++)) != Character.toLowerCase(search.charAt(s + i)))
  -                                {
  -                                    return -1;
  -                                }
  -                            }
  -                        }
  -                        else
  +                        final boolean caseFold =
  +                            ((matchFlags & MATCH_CASEINDEPENDENT) != 0);
  +                        // Compare backref to input
  +                        for (int i = 0; i < l; i++)
                           {
  -                            // Compare backref to input
  -                            for (int i = 0; i < l; i++)
  +                            if (compareChars(search.charAt(idx++), search.charAt(s + i), caseFold) != 0)
                               {
  -                                if (search.charAt(idx++) != search.charAt(s + i))
  -                                {
  -                                    return -1;
  -                                }
  +                                return -1;
                               }
                           }
                       }
  @@ -1096,24 +1092,14 @@
                           }
   
                           // Match atom differently depending on casefolding flag
  -                        if ((matchFlags & MATCH_CASEINDEPENDENT) != 0)
  -                        {
  -                            for (int i = 0; i < lenAtom; i++)
  -                            {
  -                                if (Character.toLowerCase(search.charAt(idx++)) != Character.toLowerCase(instruction[startAtom + i]))
  -                                {
  -                                    return -1;
  -                                }
  -                            }
  -                        }
  -                        else
  +                        final boolean caseFold =
  +                            ((matchFlags & MATCH_CASEINDEPENDENT) != 0);
  +
  +                        for (int i = 0; i < lenAtom; i++)
                           {
  -                            for (int i = 0; i < lenAtom; i++)
  +                            if (compareChars(search.charAt(idx++), instruction[startAtom + i], caseFold) != 0)
                               {
  -                                if (search.charAt(idx++) != instruction[startAtom + i])
  -                                {
  -                                    return -1;
  -                                }
  +                                return -1;
                               }
                           }
                       }
  @@ -1126,7 +1112,7 @@
                           {
                               return -1;
                           }
  -                        
  +
                           switch (opdata)
                           {
                               case POSIX_CLASS_ALNUM:
  @@ -1135,42 +1121,42 @@
                                       return -1;
                                   }
                                   break;
  -                                
  +
                               case POSIX_CLASS_ALPHA:
                                   if (!Character.isLetter(search.charAt(idx)))
                                   {
                                       return -1;
                                   }
                                   break;
  -                                
  +
                               case POSIX_CLASS_DIGIT:
                                   if (!Character.isDigit(search.charAt(idx)))
                                   {
                                       return -1;
                                   }
                                   break;
  -                                
  +
                               case POSIX_CLASS_BLANK: // JWL - bugbug: is this right??
                                   if (!Character.isSpaceChar(search.charAt(idx)))
                                   {
                                       return -1;
                                   }
                                   break;
  -                                
  +
                               case POSIX_CLASS_SPACE:
                                   if (!Character.isWhitespace(search.charAt(idx)))
                                   {
                                       return -1;
                                   }
                                   break;
  -                                
  +
                               case POSIX_CLASS_CNTRL:
                                   if (Character.getType(search.charAt(idx)) != Character.CONTROL)
                                   {
                                       return -1;
                                   }
                                   break;
  -                                
  +
                               case POSIX_CLASS_GRAPH: // JWL - bugbug???
                                   switch (Character.getType(search.charAt(idx)))
                                   {
  @@ -1179,33 +1165,33 @@
                                       case Character.MODIFIER_SYMBOL:
                                       case Character.OTHER_SYMBOL:
                                           break;
  -                                        
  +
                                       default:
                                           return -1;
                                   }
                                   break;
  -                                
  +
                               case POSIX_CLASS_LOWER:
                                   if (Character.getType(search.charAt(idx)) != Character.LOWERCASE_LETTER)
                                   {
                                       return -1;
                                   }
                                   break;
  -                                
  +
                               case POSIX_CLASS_UPPER:
                                   if (Character.getType(search.charAt(idx)) != Character.UPPERCASE_LETTER)
                                   {
                                       return -1;
                                   }
                                   break;
  -                                
  +
                               case POSIX_CLASS_PRINT:
                                   if (Character.getType(search.charAt(idx)) == Character.CONTROL)
                                   {
                                       return -1;
                                   }
                                   break;
  -                                
  +
                               case POSIX_CLASS_PUNCT:
                               {
                                   int type = Character.getType(search.charAt(idx));
  @@ -1217,7 +1203,7 @@
                                       case Character.CONNECTOR_PUNCTUATION:
                                       case Character.OTHER_PUNCTUATION:
                                           break;
  -                                        
  +
                                       default:
                                           return -1;
                                   }
  @@ -1235,14 +1221,14 @@
                                   }
                               }
                               break;
  -                            
  +
                               case POSIX_CLASS_JSTART:
                                   if (!Character.isJavaIdentifierStart(search.charAt(idx)))
                                   {
                                       return -1;
                                   }
                                   break;
  -                                
  +
                               case POSIX_CLASS_JPART:
                                   if (!Character.isJavaIdentifierPart(search.charAt(idx)))
                                   {
  @@ -1254,7 +1240,7 @@
                                   internalError("Bad posix class");
                                   break;
                           }
  -                    
  +
                           // Matched.
                           idx++;
                       }
  @@ -1271,34 +1257,18 @@
                           // Get character to match against character class and maybe casefold
                           char c = search.charAt(idx);
                           boolean caseFold = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
  -                        if (caseFold)
  -                        {
  -                            c = Character.toLowerCase(c);
  -                        }
  -
                           // Loop through character class checking our match character
                           int idxRange = node + nodeSize;
                           int idxEnd = idxRange + (opdata * 2);
                           boolean match = false;
  -                        for (int i = idxRange; i < idxEnd; )
  +                        for (int i = idxRange; !match && i < idxEnd; )
                           {
                               // Get start, end and match characters
                               char s = instruction[i++];
                               char e = instruction[i++];
   
  -                            // Fold ends of range and match character
  -                            if (caseFold)
  -                            {
  -                                s = Character.toLowerCase(s);
  -                                e = Character.toLowerCase(e);
  -                            }
  -
  -                            // If the match character is in range, break out
  -                            if (c >= s && c <= e)
  -                            {
  -                                match = true;
  -                                break;
  -                            }
  +                            match = ((compareChars(c, s, caseFold) >= 0)
  +                                     && (compareChars(c, e, caseFold) <= 0));
                           }
   
                           // Fail if we didn't match the character class
  @@ -1329,7 +1299,7 @@
                           {
                               return idxNew;
                           }
  -                        
  +
                           // Go to next branch (if any)
                           nextBranch = (short)instruction[node + offsetNext];
                           node += nextBranch;
  @@ -1371,6 +1341,7 @@
        * Match the current regular expression program against the current
        * input string, starting at index i of the input string.  This method
        * is only meant for internal use.
  +     *
        * @param i The input string index to start matching at
        * @return True if the input matched the expression
        */
  @@ -1411,11 +1382,12 @@
       /**
        * Matches the current regular expression program against a character array,
        * starting at a given index.
  +     *
        * @param search String to match against
        * @param i Index to start searching at
        * @return True if string matched
        */
  -    public boolean match(String search, int i) 
  +    public boolean match(String search, int i)
       {
           return match(new StringCharacterIterator(search), i);
       }
  @@ -1423,6 +1395,7 @@
       /**
        * Matches the current regular expression program against a character array,
        * starting at a given index.
  +     *
        * @param search String to match against
        * @param i Index to start searching at
        * @return True if string matched
  @@ -1459,44 +1432,25 @@
               // Prefix-anchored matching is possible
               boolean caseIndependent = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
               char[] prefix = program.prefix;
  -            for ( ;! search.isEnd(i + prefix.length - 1); i++)
  +            for ( ; !search.isEnd(i + prefix.length - 1); i++)
               {
  -                // If the first character of the prefix matches
  -                boolean match = false;
  -                if (caseIndependent)
  -                    match = Character.toLowerCase(search.charAt(i)) == Character.toLowerCase(prefix[0]);
  -                else
  -                    match = search.charAt(i) == prefix[0];
  -                if (match)
  -                {
  -                    // Save first character position
  -                    int firstChar = i++;
  -                    int k;
  -                    for (k = 1; k < prefix.length; )
  -                    {
  -                        // If there's a mismatch of any character in the prefix, give up
  -                        if (caseIndependent)
  -                            match = Character.toLowerCase(search.charAt(i++)) == Character.toLowerCase(prefix[k++]);
  -                        else
  -                            match = search.charAt(i++) == prefix[k++];
  -                        if (!match)
  -                        {
  -                            break;
  -                        }
  -                    }
  +                int j = i;
  +                int k = 0;
   
  -                    // See if the whole prefix string matched
  -                    if (k == prefix.length)
  +                boolean match;
  +                do {
  +                    // If there's a mismatch of any character in the prefix, give up
  +                    match = (compareChars(search.charAt(j++), prefix[k++], caseIndependent) == 0);
  +                } while (match && k < prefix.length);
  +
  +                // See if the whole prefix string matched
  +                if (k == prefix.length)
  +                {
  +                    // We matched the full prefix at index i, so try it
  +                    if (matchAt(i))
                       {
  -                        // We matched the full prefix at firstChar, so try it
  -                        if (matchAt(firstChar))
  -                        {
  -                            return true;
  -                        }
  +                        return true;
                       }
  -
  -                    // Match failed, reset i to continue the search
  -                    i = firstChar;
                   }
               }
               return false;
  @@ -1505,6 +1459,7 @@
   
       /**
        * Matches the current regular expression program against a String.
  +     *
        * @param search String to match against
        * @return True if string matched
        */
  @@ -1520,7 +1475,7 @@
        * "xyzzyababbayyzabbbab123", the result would be the array of Strings
        * "[xyzzy, yyz, 123]".
        *
  -     * Please note that the first string in the resulting array may be an empty
  +     * <p>Please note that the first string in the resulting array may be an empty
        * string. This happens when the very first character of input string is
        * matched by the pattern.
        *
  @@ -1620,7 +1575,7 @@
        * with $0, $1, ... $9. A regular expression of "http://[\\.\\w\\-\\?/~_@&=%]+",
        * a String to substituteIn of "visit us: http://www.apache.org!" and the
        * substitution String "&lt;a href=\"$0\"&gt;$0&lt;/a&gt;", the resulting String
  -     * returned by subst would be 
  +     * returned by subst would be
        * "visit us: &lt;a href=\"http://www.apache.org\"&gt;http://www.apache.org&lt;/a&gt;!".
        * <p>
        * <i>Note:</i> $0 represents the whole match.
  @@ -1705,7 +1660,7 @@
               // Move forward, skipping past match
               int newpos = getParenEnd(0);
   
  -            // We always want to make progress! 
  +            // We always want to make progress!
               if (newpos == pos)
               {
                   newpos++;
  @@ -1727,16 +1682,17 @@
               ret.append(substituteIn.substring(pos));
           }
   
  -        // Return string buffer as string 
  +        // Return string buffer as string
           return ret.toString();
  -    }  
  +    }
   
       /**
        * Returns an array of Strings, whose toString representation matches a regular
        * expression. This method works like the Perl function of the same name.  Given
        * a regular expression of "a*b" and an array of String objects of [foo, aab, zzz,
        * aaaab], the array of Strings returned by grep would be [aab, aaaab].
  -     * @param search Array of Objects to search 
  +     *
  +     * @param search Array of Objects to search
        * @return Array of Strings whose toString() value matches this regular expression.
        */
       public String[] grep(Object[] search)
  @@ -1763,8 +1719,11 @@
           return ret;
       }
   
  -    /** @return true if at the i-th position in the 'search' a newline ends */
  -    private boolean isNewline(int i) {
  +    /**
  +     * @return true if character at i-th position in the <code>search</code> string is a newline
  +     */
  +    private boolean isNewline(int i)
  +    {
           char nextChar = search.charAt(i);
   
           if (nextChar == '\n' || nextChar == '\r' || nextChar == '\u0085'
  @@ -1774,5 +1733,24 @@
           }
   
           return false;
  +    }
  +
  +    /**
  +     * Compares two characters.
  +     *
  +     * @param c1 first character to compare.
  +     * @param c2 second character to compare.
  +     * @param caseIndependent whether comparison is case insensitive or not.
  +     * @return negative, 0, or positive integer as the first character is
  +     *         less than, equal to, or greater than the second.
  +     */
  +    private int compareChars(char c1, char c2, boolean caseIndependent)
  +    {
  +        if (caseIndependent)
  +        {
  +            c1 = Character.toLowerCase(c1);
  +            c2 = Character.toLowerCase(c2);
  +        }
  +        return ((int)c1 - (int)c2);
       }
   }
  
  
  
  1.14      +24 -4     jakarta-regexp/src/java/org/apache/regexp/RETest.java
  
  Index: RETest.java
  ===================================================================
  RCS file: /home/cvs/jakarta-regexp/src/java/org/apache/regexp/RETest.java,v
  retrieving revision 1.13
  retrieving revision 1.14
  diff -u -r1.13 -r1.14
  --- RETest.java	27 Feb 2004 02:41:20 -0000	1.13
  +++ RETest.java	20 Mar 2004 14:35:42 -0000	1.14
  @@ -358,6 +358,26 @@
               showParens(r);
           }
   
  +        r = new RE("(A*)b\\1");
  +        r.setMatchFlags(RE.MATCH_CASEINDEPENDENT);
  +        if (!r.match("AaAaaaBAAAAAA"))
  +        {
  +            fail("Did not match 'AaAaaaBAAAAAA'.");
  +        } else {
  +            say("AaAaaaBAAAAAA = true");
  +            showParens(r);
  +        }
  +
  +        r = new RE("[A-Z]*");
  +        r.setMatchFlags(RE.MATCH_CASEINDEPENDENT);
  +        if (!r.match("CaBgDe12"))
  +        {
  +            fail("Did not match 'CaBgDe12'.");
  +        } else {
  +            say("CaBgDe12 = true");
  +            showParens(r);
  +        }
  +
           // Test MATCH_MULTILINE. Test for eol/bol symbols.
           r = new RE("^abc$", RE.MATCH_MULTILINE);
           if (!r.match("\nabc")) {
  @@ -602,7 +622,7 @@
           boolean shouldMatch = false;
           int expectedParenCount = 0;
           String[] expectedParens = null;
  -        
  +
           if (!badPattern) {
               shouldMatch = getExpectedResult(br.readLine().trim());
               if (shouldMatch) {
  @@ -769,7 +789,7 @@
       private boolean checkParens()
       {
           // Show subexpression registers
  -        if (test.showSuccesses)
  +        if (RETest.showSuccesses)
           {
               test.showParens(regexp);
           }
  @@ -850,7 +870,7 @@
        */
       void success(String s)
       {
  -        if (test.showSuccesses)
  +        if (RETest.showSuccesses)
           {
               test.say("" + RETest.NEW_LINE + "-----------------------" + RETest.NEW_LINE + "");
               test.say("Expression #" + (number) + " \"" + pattern + "\" ");
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: regexp-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: regexp-dev-help@jakarta.apache.org