You are viewing a plain text version of this content. The canonical link for it is here.
Posted to regexp-dev@jakarta.apache.org by vg...@apache.org on 2004/03/20 15:35:42 UTC
cvs commit: jakarta-regexp/src/java/org/apache/regexp RE.java RETest.java
vgritsenko 2004/03/20 06:35:42
Modified: docs jakarta-regexp.jar
src/java/org/apache/regexp RE.java RETest.java
Log:
Applied patch from Oleg Sukhodolsky: reduce code duplication, add method for
character comparison.
Revision Changes Path
1.2 +53 -64 jakarta-regexp/docs/jakarta-regexp.jar
<<Binary file>>
1.22 +121 -143 jakarta-regexp/src/java/org/apache/regexp/RE.java
Index: RE.java
===================================================================
RCS file: /home/cvs/jakarta-regexp/src/java/org/apache/regexp/RE.java,v
retrieving revision 1.21
retrieving revision 1.22
diff -u -r1.21 -r1.22
--- RE.java 27 Feb 2004 02:41:20 -0000 1.21
+++ RE.java 20 Mar 2004 14:35:42 -0000 1.22
@@ -121,14 +121,14 @@
* [:cntrl:] Control characters.
* [:digit:] Numeric characters.
* [:graph:] Characters that are printable and are also visible.
- * (A space is printable, but not visible, while an
+ * (A space is printable, but not visible, while an
* `a' is both.)
* [:lower:] Lower-case alphabetic characters.
- * [:print:] Printable characters (characters that are not
+ * [:print:] Printable characters (characters that are not
* control characters.)
* [:punct:] Punctuation characters (characters that are not letter,
* digits, control characters, or space characters).
- * [:space:] Space characters (such as space, tab, and formfeed,
+ * [:space:] Space characters (such as space, tab, and formfeed,
* to name a few).
* [:upper:] Upper-case alphabetic characters.
* [:xdigit:] Characters that are hexadecimal digits.
@@ -181,7 +181,7 @@
* AB Matches A followed by B
* A|B Matches either A or B
* (A) Used for subexpression grouping
- * (?:A) Used for subexpression clustering (just like grouping but
+ * (?:A) Used for subexpression clustering (just like grouping but
* no backrefs)
*
*
@@ -411,6 +411,7 @@
* Constructs a regular expression matcher from a String by compiling it
* using a new instance of RECompiler. If you will be compiling many
* expressions, you may prefer to use a single RECompiler object instead.
+ *
* @param pattern The regular expression pattern to compile.
* @exception RESyntaxException Thrown if the regular expression has invalid syntax.
* @see RECompiler
@@ -425,6 +426,7 @@
* Constructs a regular expression matcher from a String by compiling it
* using a new instance of RECompiler. If you will be compiling many
* expressions, you may prefer to use a single RECompiler object instead.
+ *
* @param pattern The regular expression pattern to compile.
* @param matchFlags The matching style
* @exception RESyntaxException Thrown if the regular expression has invalid syntax.
@@ -441,15 +443,14 @@
* Construct a matcher for a pre-compiled regular expression from program
* (bytecode) data. Permits special flags to be passed in to modify matching
* behaviour.
+ *
* @param program Compiled regular expression program (see RECompiler and/or recompile)
* @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
*
* <pre>
- *
* MATCH_NORMAL // Normal (case-sensitive) matching
* MATCH_CASEINDEPENDENT // Case folded comparisons
* MATCH_MULTILINE // Newline matches as BOL/EOL
- *
* </pre>
*
* @see RECompiler
@@ -465,6 +466,7 @@
/**
* Construct a matcher for a pre-compiled regular expression from program
* (bytecode) data.
+ *
* @param program Compiled regular expression program
* @see RECompiler
* @see recompile
@@ -485,6 +487,7 @@
/**
* Converts a 'simplified' regular expression to a full regular expression
+ *
* @param pattern The pattern to convert
* @return The full regular expression
*/
@@ -527,13 +530,10 @@
* @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
*
* <pre>
- *
* MATCH_NORMAL // Normal (case-sensitive) matching
* MATCH_CASEINDEPENDENT // Case folded comparisons
* MATCH_MULTILINE // Newline matches as BOL/EOL
- *
* </pre>
- *
*/
public void setMatchFlags(int matchFlags)
{
@@ -545,15 +545,12 @@
* @return Current match behaviour flags (RE.MATCH_*).
*
* <pre>
- *
* MATCH_NORMAL // Normal (case-sensitive) matching
* MATCH_CASEINDEPENDENT // Case folded comparisons
* MATCH_MULTILINE // Newline matches as BOL/EOL
- *
* </pre>
*
* @see #setMatchFlags
- *
*/
public int getMatchFlags()
{
@@ -562,6 +559,7 @@
/**
* Sets the current regular expression program used by this matcher object.
+ *
* @param program Regular expression program compiled by RECompiler.
* @see RECompiler
* @see REProgram
@@ -579,6 +577,7 @@
/**
* Returns the current regular expression program in use by this matcher object.
+ *
* @return Regular expression program
* @see #setProgram
*/
@@ -589,6 +588,7 @@
/**
* Returns the number of parenthesized subexpressions available after a successful match.
+ *
* @return Number of available parenthesized subexpressions
*/
public int getParenCount()
@@ -598,6 +598,7 @@
/**
* Gets the contents of a parenthesized subexpression after a successful match.
+ *
* @param which Nesting level of subexpression
* @return String
*/
@@ -613,8 +614,9 @@
/**
* Returns the start index of a given paren level.
+ *
* @param which Nesting level of subexpression
- * @return String index
+ * @return String index
*/
public final int getParenStart(int which)
{
@@ -624,13 +626,13 @@
{
case 0:
return start0;
-
+
case 1:
return start1;
-
+
case 2:
return start2;
-
+
default:
if (startn == null)
{
@@ -644,8 +646,9 @@
/**
* Returns the end index of a given paren level.
+ *
* @param which Nesting level of subexpression
- * @return String index
+ * @return String index
*/
public final int getParenEnd(int which)
{
@@ -655,13 +658,13 @@
{
case 0:
return end0;
-
+
case 1:
return end1;
-
+
case 2:
return end2;
-
+
default:
if (endn == null)
{
@@ -675,6 +678,7 @@
/**
* Returns the length of a given paren level.
+ *
* @param which Nesting level of subexpression
* @return Number of characters in the parenthesized subexpression
*/
@@ -689,6 +693,7 @@
/**
* Sets the start of a paren level
+ *
* @param which Which paren level
* @param i Index in input array
*/
@@ -701,15 +706,15 @@
case 0:
start0 = i;
break;
-
+
case 1:
start1 = i;
break;
-
+
case 2:
start2 = i;
break;
-
+
default:
if (startn == null)
{
@@ -723,6 +728,7 @@
/**
* Sets the end of a paren level
+ *
* @param which Which paren level
* @param i Index in input array
*/
@@ -735,15 +741,15 @@
case 0:
end0 = i;
break;
-
+
case 1:
end1 = i;
break;
-
+
case 2:
end2 = i;
break;
-
+
default:
if (endn == null)
{
@@ -759,6 +765,7 @@
* Throws an Error representing an internal error condition probably resulting
* from a bug in the regular expression compiler (or possibly data corruption).
* In practice, this should be very rare.
+ *
* @param s Error description
*/
protected void internalError(String s) throws Error
@@ -785,10 +792,11 @@
/**
* Try to match a string against a subset of nodes in the program
+ *
* @param firstNode Node to start at in program
- * @param lastNode Last valid node (used for matching a subexpression without
- * matching the rest of the program as well).
- * @param idxStart Starting position in character array
+ * @param lastNode Last valid node (used for matching a subexpression without
+ * matching the rest of the program as well).
+ * @param idxStart Starting position in character array
* @return Final input array index if match succeeded. -1 if not.
*/
protected int matchNodes(int firstNode, int lastNode, int idxStart)
@@ -925,26 +933,14 @@
}
// Case fold the backref?
- if ((matchFlags & MATCH_CASEINDEPENDENT) != 0)
- {
- // Compare backref to input, case-folding as we go
- for (int i = 0; i < l; i++)
- {
- if (Character.toLowerCase(search.charAt(idx++)) != Character.toLowerCase(search.charAt(s + i)))
- {
- return -1;
- }
- }
- }
- else
+ final boolean caseFold =
+ ((matchFlags & MATCH_CASEINDEPENDENT) != 0);
+ // Compare backref to input
+ for (int i = 0; i < l; i++)
{
- // Compare backref to input
- for (int i = 0; i < l; i++)
+ if (compareChars(search.charAt(idx++), search.charAt(s + i), caseFold) != 0)
{
- if (search.charAt(idx++) != search.charAt(s + i))
- {
- return -1;
- }
+ return -1;
}
}
}
@@ -1096,24 +1092,14 @@
}
// Match atom differently depending on casefolding flag
- if ((matchFlags & MATCH_CASEINDEPENDENT) != 0)
- {
- for (int i = 0; i < lenAtom; i++)
- {
- if (Character.toLowerCase(search.charAt(idx++)) != Character.toLowerCase(instruction[startAtom + i]))
- {
- return -1;
- }
- }
- }
- else
+ final boolean caseFold =
+ ((matchFlags & MATCH_CASEINDEPENDENT) != 0);
+
+ for (int i = 0; i < lenAtom; i++)
{
- for (int i = 0; i < lenAtom; i++)
+ if (compareChars(search.charAt(idx++), instruction[startAtom + i], caseFold) != 0)
{
- if (search.charAt(idx++) != instruction[startAtom + i])
- {
- return -1;
- }
+ return -1;
}
}
}
@@ -1126,7 +1112,7 @@
{
return -1;
}
-
+
switch (opdata)
{
case POSIX_CLASS_ALNUM:
@@ -1135,42 +1121,42 @@
return -1;
}
break;
-
+
case POSIX_CLASS_ALPHA:
if (!Character.isLetter(search.charAt(idx)))
{
return -1;
}
break;
-
+
case POSIX_CLASS_DIGIT:
if (!Character.isDigit(search.charAt(idx)))
{
return -1;
}
break;
-
+
case POSIX_CLASS_BLANK: // JWL - bugbug: is this right??
if (!Character.isSpaceChar(search.charAt(idx)))
{
return -1;
}
break;
-
+
case POSIX_CLASS_SPACE:
if (!Character.isWhitespace(search.charAt(idx)))
{
return -1;
}
break;
-
+
case POSIX_CLASS_CNTRL:
if (Character.getType(search.charAt(idx)) != Character.CONTROL)
{
return -1;
}
break;
-
+
case POSIX_CLASS_GRAPH: // JWL - bugbug???
switch (Character.getType(search.charAt(idx)))
{
@@ -1179,33 +1165,33 @@
case Character.MODIFIER_SYMBOL:
case Character.OTHER_SYMBOL:
break;
-
+
default:
return -1;
}
break;
-
+
case POSIX_CLASS_LOWER:
if (Character.getType(search.charAt(idx)) != Character.LOWERCASE_LETTER)
{
return -1;
}
break;
-
+
case POSIX_CLASS_UPPER:
if (Character.getType(search.charAt(idx)) != Character.UPPERCASE_LETTER)
{
return -1;
}
break;
-
+
case POSIX_CLASS_PRINT:
if (Character.getType(search.charAt(idx)) == Character.CONTROL)
{
return -1;
}
break;
-
+
case POSIX_CLASS_PUNCT:
{
int type = Character.getType(search.charAt(idx));
@@ -1217,7 +1203,7 @@
case Character.CONNECTOR_PUNCTUATION:
case Character.OTHER_PUNCTUATION:
break;
-
+
default:
return -1;
}
@@ -1235,14 +1221,14 @@
}
}
break;
-
+
case POSIX_CLASS_JSTART:
if (!Character.isJavaIdentifierStart(search.charAt(idx)))
{
return -1;
}
break;
-
+
case POSIX_CLASS_JPART:
if (!Character.isJavaIdentifierPart(search.charAt(idx)))
{
@@ -1254,7 +1240,7 @@
internalError("Bad posix class");
break;
}
-
+
// Matched.
idx++;
}
@@ -1271,34 +1257,18 @@
// Get character to match against character class and maybe casefold
char c = search.charAt(idx);
boolean caseFold = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
- if (caseFold)
- {
- c = Character.toLowerCase(c);
- }
-
// Loop through character class checking our match character
int idxRange = node + nodeSize;
int idxEnd = idxRange + (opdata * 2);
boolean match = false;
- for (int i = idxRange; i < idxEnd; )
+ for (int i = idxRange; !match && i < idxEnd; )
{
// Get start, end and match characters
char s = instruction[i++];
char e = instruction[i++];
- // Fold ends of range and match character
- if (caseFold)
- {
- s = Character.toLowerCase(s);
- e = Character.toLowerCase(e);
- }
-
- // If the match character is in range, break out
- if (c >= s && c <= e)
- {
- match = true;
- break;
- }
+ match = ((compareChars(c, s, caseFold) >= 0)
+ && (compareChars(c, e, caseFold) <= 0));
}
// Fail if we didn't match the character class
@@ -1329,7 +1299,7 @@
{
return idxNew;
}
-
+
// Go to next branch (if any)
nextBranch = (short)instruction[node + offsetNext];
node += nextBranch;
@@ -1371,6 +1341,7 @@
* Match the current regular expression program against the current
* input string, starting at index i of the input string. This method
* is only meant for internal use.
+ *
* @param i The input string index to start matching at
* @return True if the input matched the expression
*/
@@ -1411,11 +1382,12 @@
/**
* Matches the current regular expression program against a character array,
* starting at a given index.
+ *
* @param search String to match against
* @param i Index to start searching at
* @return True if string matched
*/
- public boolean match(String search, int i)
+ public boolean match(String search, int i)
{
return match(new StringCharacterIterator(search), i);
}
@@ -1423,6 +1395,7 @@
/**
* Matches the current regular expression program against a character array,
* starting at a given index.
+ *
* @param search String to match against
* @param i Index to start searching at
* @return True if string matched
@@ -1459,44 +1432,25 @@
// Prefix-anchored matching is possible
boolean caseIndependent = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
char[] prefix = program.prefix;
- for ( ;! search.isEnd(i + prefix.length - 1); i++)
+ for ( ; !search.isEnd(i + prefix.length - 1); i++)
{
- // If the first character of the prefix matches
- boolean match = false;
- if (caseIndependent)
- match = Character.toLowerCase(search.charAt(i)) == Character.toLowerCase(prefix[0]);
- else
- match = search.charAt(i) == prefix[0];
- if (match)
- {
- // Save first character position
- int firstChar = i++;
- int k;
- for (k = 1; k < prefix.length; )
- {
- // If there's a mismatch of any character in the prefix, give up
- if (caseIndependent)
- match = Character.toLowerCase(search.charAt(i++)) == Character.toLowerCase(prefix[k++]);
- else
- match = search.charAt(i++) == prefix[k++];
- if (!match)
- {
- break;
- }
- }
+ int j = i;
+ int k = 0;
- // See if the whole prefix string matched
- if (k == prefix.length)
+ boolean match;
+ do {
+ // If there's a mismatch of any character in the prefix, give up
+ match = (compareChars(search.charAt(j++), prefix[k++], caseIndependent) == 0);
+ } while (match && k < prefix.length);
+
+ // See if the whole prefix string matched
+ if (k == prefix.length)
+ {
+ // We matched the full prefix starting at index i, so try it
+ if (matchAt(i))
{
- // We matched the full prefix at firstChar, so try it
- if (matchAt(firstChar))
- {
- return true;
- }
+ return true;
}
-
- // Match failed, reset i to continue the search
- i = firstChar;
}
}
return false;
@@ -1505,6 +1459,7 @@
/**
* Matches the current regular expression program against a String.
+ *
* @param search String to match against
* @return True if string matched
*/
@@ -1520,7 +1475,7 @@
* "xyzzyababbayyzabbbab123", the result would be the array of Strings
* "[xyzzy, yyz, 123]".
*
- * Please note that the first string in the resulting array may be an empty
+ * <p>Please note that the first string in the resulting array may be an empty
* string. This happens when the very first character of input string is
* matched by the pattern.
*
@@ -1620,7 +1575,7 @@
* with $0, $1, ... $9. A regular expression of "http://[\\.\\w\\-\\?/~_@&=%]+",
* a String to substituteIn of "visit us: http://www.apache.org!" and the
* substitution String "<a href=\"$0\">$0</a>", the resulting String
- * returned by subst would be
+ * returned by subst would be
* "visit us: <a href=\"http://www.apache.org\">http://www.apache.org</a>!".
* <p>
* <i>Note:</i> $0 represents the whole match.
@@ -1705,7 +1660,7 @@
// Move forward, skipping past match
int newpos = getParenEnd(0);
- // We always want to make progress!
+ // We always want to make progress!
if (newpos == pos)
{
newpos++;
@@ -1727,16 +1682,17 @@
ret.append(substituteIn.substring(pos));
}
- // Return string buffer as string
+ // Return string buffer as string
return ret.toString();
- }
+ }
/**
* Returns an array of Strings, whose toString representation matches a regular
* expression. This method works like the Perl function of the same name. Given
* a regular expression of "a*b" and an array of String objects of [foo, aab, zzz,
* aaaab], the array of Strings returned by grep would be [aab, aaaab].
- * @param search Array of Objects to search
+ *
+ * @param search Array of Objects to search
* @return Array of Strings whose toString() value matches this regular expression.
*/
public String[] grep(Object[] search)
@@ -1763,8 +1719,11 @@
return ret;
}
- /** @return true if at the i-th position in the 'search' a newline ends */
- private boolean isNewline(int i) {
+ /**
+ * @return true if character at i-th position in the <code>search</code> string is a newline
+ */
+ private boolean isNewline(int i)
+ {
char nextChar = search.charAt(i);
if (nextChar == '\n' || nextChar == '\r' || nextChar == '\u0085'
@@ -1774,5 +1733,24 @@
}
return false;
+ }
+
+ /**
+ * Compares two characters.
+ *
+ * @param c1 first character to compare.
+ * @param c2 second character to compare.
+ * @param caseIndependent whether comparison is case insensitive or not.
+ * @return negative, 0, or positive integer as the first character is
+ * less than, equal to, or greater than the second.
+ */
+ private int compareChars(char c1, char c2, boolean caseIndependent)
+ {
+ if (caseIndependent)
+ {
+ c1 = Character.toLowerCase(c1);
+ c2 = Character.toLowerCase(c2);
+ }
+ return ((int)c1 - (int)c2);
}
}
1.14 +24 -4 jakarta-regexp/src/java/org/apache/regexp/RETest.java
Index: RETest.java
===================================================================
RCS file: /home/cvs/jakarta-regexp/src/java/org/apache/regexp/RETest.java,v
retrieving revision 1.13
retrieving revision 1.14
diff -u -r1.13 -r1.14
--- RETest.java 27 Feb 2004 02:41:20 -0000 1.13
+++ RETest.java 20 Mar 2004 14:35:42 -0000 1.14
@@ -358,6 +358,26 @@
showParens(r);
}
+ r = new RE("(A*)b\\1");
+ r.setMatchFlags(RE.MATCH_CASEINDEPENDENT);
+ if (!r.match("AaAaaaBAAAAAA"))
+ {
+ fail("Did not match 'AaAaaaBAAAAAA'.");
+ } else {
+ say("AaAaaaBAAAAAA = true");
+ showParens(r);
+ }
+
+ r = new RE("[A-Z]*");
+ r.setMatchFlags(RE.MATCH_CASEINDEPENDENT);
+ if (!r.match("CaBgDe12"))
+ {
+ fail("Did not match 'CaBgDe12'.");
+ } else {
+ say("CaBgDe12 = true");
+ showParens(r);
+ }
+
// Test MATCH_MULTILINE. Test for eol/bol symbols.
r = new RE("^abc$", RE.MATCH_MULTILINE);
if (!r.match("\nabc")) {
@@ -602,7 +622,7 @@
boolean shouldMatch = false;
int expectedParenCount = 0;
String[] expectedParens = null;
-
+
if (!badPattern) {
shouldMatch = getExpectedResult(br.readLine().trim());
if (shouldMatch) {
@@ -769,7 +789,7 @@
private boolean checkParens()
{
// Show subexpression registers
- if (test.showSuccesses)
+ if (RETest.showSuccesses)
{
test.showParens(regexp);
}
@@ -850,7 +870,7 @@
*/
void success(String s)
{
- if (test.showSuccesses)
+ if (RETest.showSuccesses)
{
test.say("" + RETest.NEW_LINE + "-----------------------" + RETest.NEW_LINE + "");
test.say("Expression #" + (number) + " \"" + pattern + "\" ");
---------------------------------------------------------------------
To unsubscribe, e-mail: regexp-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: regexp-dev-help@jakarta.apache.org