You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oro-dev@jakarta.apache.org by df...@apache.org on 2001/05/17 20:59:51 UTC

cvs commit: jakarta-oro/src/java/org/apache/oro/text/regex Perl5Substitution.java

dfs         01/05/17 11:59:50

  Modified:    src/java/org/apache/oro/text/regex Perl5Substitution.java
  Log:
  Applied Mark Murphy's patch that adds case modification support to
  substitution processing.  Also updated the class documentation to
  reflect the new behavior.  Also started process of updating javadoc
  version tags and adding since tags to javadocs, moving the RCS Id tag
  to the top of the file (well, after the license).
  
  Revision  Changes    Path
  1.5       +244 -83   jakarta-oro/src/java/org/apache/oro/text/regex/Perl5Substitution.java
  
  Index: Perl5Substitution.java
  ===================================================================
  RCS file: /home/cvs/jakarta-oro/src/java/org/apache/oro/text/regex/Perl5Substitution.java,v
  retrieving revision 1.4
  retrieving revision 1.5
  diff -u -r1.4 -r1.5
  --- Perl5Substitution.java	2001/05/09 18:22:07	1.4
  +++ Perl5Substitution.java	2001/05/17 18:59:47	1.5
  @@ -57,6 +57,9 @@
    * by Daniel F. Savarese. We appreciate his contributions.
    */
   
  +/*
  + * $Id: Perl5Substitution.java,v 1.5 2001/05/17 18:59:47 dfs Exp $
  + */
   import java.util.*;
   
   /**
  @@ -68,7 +71,7 @@
    * The substitution string may contain variable interpolations referring
    * to the saved parenthesized groups of the search pattern.
    * A variable interpolation is denoted by <b>$1</b>, or <b>$2</b>,
  - * or <b>$3</b>, etc.  If you don want such expressions to be
  + * or <b>$3</b>, etc.  If you want such expressions to be
    * interpreted literally, you should set the <b> numInterpolations </b>
    * parameter to <b> INTERPOLATE_NONE </b>.  It is easiest to explain
    * what an interpolated variable does by giving an example:
  @@ -105,15 +108,31 @@
    * <p>
    * A final thing to keep in mind is that if you use an interpolation variable
    * that corresponds to a group not contained in the match, then it is
  - * interpreted literally.  So given the regular expression from the
  + * interpreted as the empty string.  So given the regular expression from the
    * example, and a substitution expression of <b>a$2-</b>, the result
    * of the last sample input would be:
  - * <pre><b>Tank a$2- 85  Tank a$2- 32  Tank a$2- 22</b></pre>
  - * Also, <b>$0</b> is always interpreted literally.
  -
  - @author <a href="mailto:oro-dev@jakarta.apache.org">Daniel F. Savarese</a>
  - @version $Id: Perl5Substitution.java,v 1.4 2001/05/09 18:22:07 dfs Exp $
  -
  + * <pre><b>Tank a- 85  Tank a- 32  Tank a- 22</b></pre>
  + * Also, the result of substituting <b>$0</b> or $ followed by a
  + * non-positive integer is undefined.  In order to include a $ in a
  + * substitution, it should be escaped with a backslash (e.g., <b>"\\$0"</b>).
  + * <p>
  + * Perl5 double-quoted string case modification is also supported in
  + * the substitution.  The following escape sequences are supported:
  + * <dl compact>
  + *  <dt> \\U <dd> make substitution uppercase until end of substitution or \\E
  + *  <dt> \\u <dd> make next character uppercase
  + *  <dt> \\L <dd> make substitution lowercase until end of substitution or \\E
  + *  <dt> \\l <dd> make next character lowercase
  + *  <dt> \\E <dd> mark the end of the case modification
  + * </dl>
  + * The double backslashes are shown to remind you that to make a
  + * backslash get past Java's string handling and appear as a backslash
  + * to the substitution, you must escape the backslash.
  + *
  + * @author <a href="mailto:oro-dev@jakarta.apache.org">Daniel F. Savarese</a>
  + * @author <a href="mailto:oro-dev@jakarta.apache.org">Mark Murphy</a>
  + * @version @version@
  + * @since 1.1
    * @see Substitution
    * @see Util
    * @see Util#substitute
  @@ -135,70 +154,162 @@
      */
     public static final int INTERPOLATE_NONE = -1;
   
  +  /**
  +   * The initial size and unit of growth for the
  +   * {@link #_subOpcodes _subOpcodes} array.
  +   */
  +  private static final int __OPCODE_STORAGE_SIZE = 32;
  +
  +  /**
  +   * The maximum number of groups supported by interpolation.
  +   */
  +  private static final int __MAX_GROUPS = Character.MAX_VALUE;
  +
  +  /**
  +   * A constant declaring opcode for copy operation.
  +   */
  +  static final int _OPCODE_COPY            = -1;
  +
  +  /**
  +   * A constant declaring opcode for lowercase char operation.
  +   */
  +  static final int _OPCODE_LOWERCASE_CHAR  = -2;
  +
  +  /**
  +   * A constant declaring opcode for uppercase char operation.
  +   */
  +  static final int _OPCODE_UPPERCASE_CHAR  = -3;
  +
  +  /**
  +   * A constant declaring opcode for lowercase mode operation.
  +   */
  +  static final int _OPCODE_LOWERCASE_MODE  = -4;
  +
  +  /**
  +   * A constant declaring opcode for uppercase mode operation.
  +   */
  +  static final int _OPCODE_UPPERCASE_MODE  = -5;
  +
  +  /**
  +   * A constant declaring opcode for end of case mode operation.
  +   */
  +  static final int _OPCODE_ENDCASE_MODE    = -6;
  +  
     int _numInterpolations;
  -  ArrayList _substitutions;
  +  int[] _subOpcodes;
  +  int _subOpcodesCount;
  +  char[] _substitutionChars;
  +
     transient String _lastInterpolation;
   
  -  static ArrayList _parseSubs(String sub) {
  -    boolean saveDigits, storedInterpolation;
  -    int current;
  -    char[] str;
  -    ArrayList subs;
  -    StringBuffer numBuffer, strBuffer;
  -
  -    subs = new ArrayList(5);
  -    numBuffer = new StringBuffer(5);
  -    strBuffer = new StringBuffer(10);
  +  private void __addElement(int value) {
  +    int len = _subOpcodes.length;
  +    if (_subOpcodesCount == len) {
  +      int[] newarray = new int[len + __OPCODE_STORAGE_SIZE];
  +      System.arraycopy(_subOpcodes, 0, newarray, 0, len);
  +      _subOpcodes = newarray;
  +    }
  +    _subOpcodes[_subOpcodesCount++] = value;
  +  }
   
  -    str = sub.toCharArray();
  -    current = 0;
  +  private void _parseSubs(String sub) {
  +    boolean saveDigits, escapeMode, caseMode;
  +    int posParam;
  +    int offset;
  +
  +    char[] subChars = _substitutionChars = sub.toCharArray();
  +    int subLength = subChars.length;
  +
  +    _subOpcodes = new int[__OPCODE_STORAGE_SIZE];
  +    _subOpcodesCount = 0;
  +    
  +    posParam = 0;
  +    offset = -1;
       saveDigits = false;
  -    storedInterpolation = false;
  +    escapeMode = false;
  +    caseMode = false;
   
  -    while(current < str.length) {
  -      if(saveDigits && Character.isDigit(str[current])) {
  -	numBuffer.append(str[current]);
  -
  -	if(strBuffer.length() > 0) {
  -	  subs.add(strBuffer.toString());
  -	  strBuffer.setLength(0);
  -	}
  -      } else {
  -	if(saveDigits) {
  -	  try {
  -	    subs.add(new Integer(numBuffer.toString()));
  -	    storedInterpolation = true;
  -	  } catch(NumberFormatException e) {
  -	    subs.add(numBuffer.toString());
  +    for (int current = 0; current < subLength; current++) {
  +      char c = subChars[current];
  +      char nextc;
  +      int next = current + 1;
  +        
  +      // Save digits
  +      if (saveDigits) {
  +	int digit = Character.digit(c, 10);
  +	if (digit > -1) {
  +	  if (posParam <= __MAX_GROUPS) {
  +	    posParam *= 10;
  +	    posParam += digit;
   	  }
  -
  -	  numBuffer.setLength(0);
  -	  saveDigits = false;
  +	  if (next == subLength) {
  +	    __addElement(posParam);
  +	  }
  +	  continue;
   	}
  +	__addElement(posParam);
  +	posParam = 0;
  +	saveDigits = false;
  +      }
   
  -	if(str[current] == '$' &&
  -	   current + 1 < str.length && str[current + 1] != '0' &&
  -	   Character.isDigit(str[current + 1]))
  -	  saveDigits = true;
  -	else
  -	  strBuffer.append(str[current]);
  +      if ((c != '$' && c != '\\') || escapeMode) {
  +	escapeMode = false;
  +	if (offset < 0) {
  +	  offset = current;
  +	  __addElement(_OPCODE_COPY);
  +	  __addElement(offset);
  +	}
  +	if (next == subLength) {
  +	  __addElement(next - offset);
  +	}
  +	continue;
         }
   
  -      ++current;
  -    } // end while
   
  +      if (offset >= 0) {
  +	__addElement(current - offset);
  +	offset = -1;
  +      }
   
  -    if(saveDigits) {
  -      try {
  -	subs.add(new Integer(numBuffer.toString()));
  -	storedInterpolation = true;
  -      } catch(NumberFormatException e) {
  -	subs.add(numBuffer.toString());
  +      // Only do positional and escapes if we have a next char
  +      if (next == subLength)
  +	continue;
  +      nextc = subChars[next];
  +
  +      // Positional params
  +      if (c == '$') {
  +	saveDigits = (nextc != '0' && Character.isDigit(nextc));
         }
  -    } else if(strBuffer.length() > 0)
  -      subs.add(strBuffer.toString());
   
  -    return (storedInterpolation ? subs : null);
  +      // Escape codes
  +      else if (c == '\\') {
  +	if (nextc == 'l') {
  +	  if (!caseMode){
  +	    __addElement(_OPCODE_LOWERCASE_CHAR);
  +	    current++;
  +	  }
  +	} else if (nextc == 'u') {
  +	  if (!caseMode) {
  +	    __addElement(_OPCODE_UPPERCASE_CHAR);
  +	    current++;
  +	  }
  +	} else if (nextc == 'L') {
  +	  __addElement(_OPCODE_LOWERCASE_MODE);
  +	  current++;
  +	  caseMode = true;
  +	} else if (nextc == 'U') {
  +	  __addElement(_OPCODE_UPPERCASE_MODE);
  +	  current++;
  +	  caseMode = true;
  +	} else if (nextc == 'E') {
  +	  __addElement(_OPCODE_ENDCASE_MODE);
  +	  current++;
  +	  caseMode = false;
  +	} else {
  +	  escapeMode = true;
  +	}
  +      }
  +    }
     }
   
   
  @@ -209,33 +320,83 @@
     }
   
     void _calcSub(StringBuffer buffer, MatchResult result) {
  -    int size, value;
  -    Object obj;
  -    Integer integer;
  +    int size, offset, count, caseMode;
       String group;
  -    Iterator it;
  -
  -    it = _substitutions.iterator();
  -
  -    while(it.hasNext()) {
  -      obj = it.next();
  +    char[] sub, str, match;
  +    int[] subOpcodes = _subOpcodes;
   
  -      if(obj instanceof String)
  -	buffer.append(obj);
  -      else {
  -	integer = (Integer)obj;
  -	value = integer.intValue();
  +    caseMode = 0;
   
  -	if(value > 0 && value < result.groups()) {
  -	  group = result.group(value);
  +    str = _substitutionChars;
  +    match = result.group(0).toCharArray();
   
  -	  if(group != null)
  -	    buffer.append(group);
  -	} else {
  -	  buffer.append('$');
  -	  buffer.append(value);
  +    size = _subOpcodesCount;
  +
  +    for (int element = 0; element < size; element++) {
  +      int value = subOpcodes[element];
  +
  +      // If we have a group, set up interpolation, else
  +      // interpret op code.
  +      if(value > 0 && value < result.groups()) {
  +	int end, len;
  +	offset = result.begin(value);
  +	
  +	if (offset < 0) continue;
  +
  +	end = result.end(value);
  +
  +	if (end < 0) continue;
  +
  +	len = result.length();
  +
  +	if (offset >= len || end > len || offset >= end) continue;
  +
  +	count = end - offset;
  +	sub = match;
  +      } else if (value == _OPCODE_COPY) {
  +	element++;
  +	if (element >= size) continue;
  +	offset = subOpcodes[element];
  +	element++;
  +	if (element >= size) continue;
  +	count = subOpcodes[element];
  +	sub = str;
  +      } else if (value == _OPCODE_LOWERCASE_CHAR ||
  +		 value == _OPCODE_UPPERCASE_CHAR) {
  +	  if (caseMode != _OPCODE_LOWERCASE_MODE &&
  +	      caseMode != _OPCODE_UPPERCASE_MODE)
  +	      caseMode = value;
  +	  continue;
  +      } else if (value == _OPCODE_LOWERCASE_MODE ||
  +		 value == _OPCODE_UPPERCASE_MODE) {
  +	caseMode = value;
  +	continue;
  +      } else if (value == _OPCODE_ENDCASE_MODE) {
  +	caseMode = 0;
  +	continue;
  +      } else
  +	continue;
  +
  +      // Apply modes to buf
  +      if (caseMode == _OPCODE_LOWERCASE_CHAR) {
  +	buffer.append(Character.toLowerCase(sub[offset++]));
  +	buffer.append(sub, offset, --count);
  +	caseMode = 0;
  +      } else if (caseMode == _OPCODE_UPPERCASE_CHAR) {
  +	buffer.append(Character.toUpperCase(sub[offset++]));
  +	buffer.append(sub, offset, --count);
  +	caseMode = 0;
  +      } else if (caseMode == _OPCODE_LOWERCASE_MODE) {
  +	for (int end = offset + count; offset < end; ) {
  +	  buffer.append(Character.toLowerCase(sub[offset++]));
   	}
  -      }
  +      } else if (caseMode == _OPCODE_UPPERCASE_MODE) {
  +	for (int end = offset + count; offset < end; ) {
  +	  buffer.append(Character.toUpperCase(sub[offset++]));
  +	}
  +      } else
  +	buffer.append(sub, offset, count);
  +        
       }
     }
   
  @@ -324,10 +485,10 @@
       _numInterpolations = numInterpolations;
   
       if(numInterpolations != INTERPOLATE_NONE && 
  -       substitution.indexOf('$') != -1)
  -      _substitutions = _parseSubs(substitution);
  +       (substitution.indexOf('$') != -1 || substitution.indexOf('\\') != -1))
  +      _parseSubs(substitution);
       else
  -      _substitutions = null;
  +      _subOpcodes = null;
       _lastInterpolation = null;
     }
   
  @@ -353,7 +514,7 @@
   				 int substitutionCount, String originalInput, 
   				 PatternMatcher matcher, Pattern pattern)
     {
  -    if(_substitutions == null) {
  +    if(_subOpcodes == null) {
         super.appendSubstitution(appendBuffer, match, substitutionCount,
   			       originalInput, matcher, pattern);
         return;