You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oro-dev@jakarta.apache.org by df...@locus.apache.org on 2000/09/15 07:17:27 UTC
cvs commit: jakarta-oro/src/java/org/apache/oro/text/perl Perl5Util.java

dfs         00/09/14 22:17:27

  Modified:    src/java/org/apache/oro/text/regex Util.java
               src/java/org/apache/oro/text/perl Perl5Util.java
  Log:
  Changed split method to accept a List argument to which results are
  appended rather than create a new Vector each time.  This makes
  it easier to reduce memory allocation overheads through the reuse of List
  instances.
  
  Revision  Changes    Path
  1.3       +117 -20   jakarta-oro/src/java/org/apache/oro/text/regex/Util.java
  
  Index: Util.java
  ===================================================================
  RCS file: /home/cvs/jakarta-oro/src/java/org/apache/oro/text/regex/Util.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- Util.java	2000/07/23 23:25:27	1.2
  +++ Util.java	2000/09/15 05:17:24	1.3
  @@ -85,7 +85,7 @@
    * </ol>
   
    @author <a href="mailto:dfs@savarese.org">Daniel F. Savarese</a>
  - @version $Id: Util.java,v 1.2 2000/07/23 23:25:27 jon Exp $
  + @version $Id: Util.java,v 1.3 2000/09/15 05:17:24 dfs Exp $
   
    * @see Pattern
    * @see PatternMatcher
  @@ -111,7 +111,114 @@
      */
     private Util() { }
   
  +
  +  /**
  +   * Splits up a <code>String</code> instance and stores results as a
  +   * <code>List</code> of substrings numbering no more than a specified
  +   * limit.  The string is split with a regular expression as the delimiter. 
  +   * The <b>limit</b> parameter essentially says to split the
  +   * string only on at most the first <b>limit - 1</b> number of pattern
  +   * occurrences.
  +   * <p>
  +   * This method is inspired by the Perl split() function and behaves 
  +   * identically to it when used in conjunction with the Perl5Matcher and
  +   * Perl5Pattern classes except for the following difference:
  +   * <ul><p>
  +   * In Perl, if the split expression contains parentheses, the split()
  +   * method creates additional list elements from each of the matching
  +   * subgroups in the pattern.  In other words:
  +   * <ul><p>
  +   * <code>split(list, "/([,-])/", "8-12,15,18", Util.SPLIT_ALL)</code></ul>
  +   * <p> produces the list containing:
  +   * <ul><p><code> { "8", "-", "12", ",", "15", ",", "18" } </code> </ul>
  +   * <p> The OROMatcher split method does not follow this behavior.  The
  +   * following list would be produced by OROMatcher:
  +   * <ul><p><code> { "8", "12",  "15", "18" } </code> </ul>
  +   * <p> To obtain the Perl behavior, use
  +   * {@link org.apache.oro.text.perl.Perl5Util#split}.
  +   * </ul>
  +   * <p>
  +   * @param results A List to which the split results are appended.
  +   *         After the method returns, it contains the substrings of the input
  +   *         that occur between the regular expression delimiter occurrences.
  +   *         The input will not be split into any more substrings than the
  +   *         specified <code>limit</code>.  A way of thinking of this is that
  +   *         only the first <code>limit - 1</code> matches of the delimiting
  +   *         regular expression will be used to split the input.
  +   * @param matcher The regular expression matcher to execute the split.
  +   * @param pattern The regular expression to use as a split delimiter.
  +   * @param input  The <code>String</code> to split.
  +   * @param limit  The limit on the number of resulting split elements.
  +   *               Values <= 0 produce the same behavior as using the
  +   *               <b>SPLIT_ALL</b> constant which causes the limit to be 
  +   *               ignored and splits to be performed on all occurrences of
  +   *               the pattern.  You should use the <b>SPLIT_ALL</b> constant
  +   *               to achieve this behavior instead of relying on the default
  +   *               behavior associated with non-positive limit values.
  +   *
  +   */
  +  public static void split(List results, PatternMatcher matcher,
  +			   Pattern pattern, String input, int limit)
  +  {
  +    int beginOffset;
  +    MatchResult currentResult;
  +    PatternMatcherInput pinput;
  +
  +    pinput = new PatternMatcherInput(input);
  +    beginOffset = 0;
  +
  +    while(--limit != 0 && matcher.contains(pinput, pattern)) {
  +      currentResult = matcher.getMatch();
  +      results.add(input.substring(beginOffset,
  +				  currentResult.beginOffset(0)));
  +      beginOffset = currentResult.endOffset(0);
  +    }
  +
  +    results.add(input.substring(beginOffset, input.length()));
  +  }
  +
  +
     /**
  +   * Splits up a <code>String</code> instance and stores results as a
  +   * <code>List</code> of all its substrings using a regular expression
  +   * as the delimiter.
  +   * This method is inspired by the Perl split() function and behaves 
  +   * identically to it when used in conjunction with the Perl5Matcher and
  +   * Perl5Pattern classes except for the following difference:
  +   * <p>
  +   * <ul>
  +   * In Perl, if the split expression contains parentheses, the split()
  +   * method creates additional list elements from each of the matching
  +   * subgroups in the pattern.  In other words:
  +   * <ul><p><code>split(list, "/([,-])/", "8-12,15,18")</code></ul>
  +   * <p> produces the list containing: 
  +   * <ul><p><code> { "8", "-", "12", ",", "15", ",", "18" } </code> </ul>
  +   * <p> The OROMatcher split method does not follow this behavior.  The
  +   * following list would be produced by OROMatcher:
  +   * <ul><p><code> { "8", "12",  "15", "18" } </code> </ul>
  +   * <p> To obtain the Perl behavior, use
  +   * {@link org.apache.oro.text.perl.Perl5Util#split}.
  +   * </ul>
  +   * <p>
  +   * This method is identical to calling:
  +   * <blockquote><pre>
  +   * split(results, matcher, pattern, input, Util.SPLIT_ALL);
  +   * </pre></blockquote>
  +   * <p>
  +   * @param results A <code>List</code> to which all the substrings of
  +   *         the input that occur between the regular expression delimiter
  +   *         occurrences are appended.
  +   * @param matcher The regular expression matcher to execute the split.
  +   * @param pattern The regular expression to use as a split delimiter.
  +   * @param input  The <code>String</code> to split.
  +   */
  +  public static void split(List results,  PatternMatcher matcher,
  +			   Pattern pattern, String input)
  +  {
  +    split(results, matcher, pattern, input, SPLIT_ALL);
  +  }
  +
  +  /**
      * Splits up a <code>String</code> instance into strings contained in a
      * <code>Vector</code> of size not greater than a specified limit.  The
      * string is split with a regular expression as the delimiter. 
  @@ -132,11 +239,12 @@
      * <p> The OROMatcher split method does not follow this behavior.  The
      * following Vector would be produced by OROMatcher:
      * <ul><p><code> { "8", "12",  "15", "18" } </code> </ul>
  -   * <p> To obtain the Perl behavior, use split method in the PerlTools
  -   * package available from
  -   * <a href="http://www.oroinc.com/"> http://www.oroinc.com/ </a>.
  +   * <p> To obtain the Perl behavior, use
  +   * {@link org.apache.oro.text.perl.Perl5Util#split}.
      * </ul>
      * <p>
  +   * @deprecated Use {@link #split(List, PatternMatcher, Pattern, String, int)}
  +   *             instead.
      * @param matcher The regular expression matcher to execute the split.
      * @param pattern The regular expression to use as a split delimiter.
      * @param input  The <code>String</code> to split.
  @@ -157,21 +265,9 @@
     public static Vector split(PatternMatcher matcher, Pattern pattern,
   			     String input, int limit)
     {
  -    int beginOffset;
       Vector results = new Vector(20); 
  -    MatchResult currentResult;
  -    PatternMatcherInput pinput;
  -
  -    pinput = new PatternMatcherInput(input);
  -    beginOffset = 0;
   
  -    while(--limit != 0 && matcher.contains(pinput, pattern)) {
  -      currentResult = matcher.getMatch();
  -      results.addElement(input.substring(beginOffset,
  -					 currentResult.beginOffset(0)));
  -      beginOffset = currentResult.endOffset(0);
  -    }
  -    results.addElement(input.substring(beginOffset, input.length()));
  +    split(results, matcher, pattern, input, limit);
   
       return results;
     }
  @@ -194,9 +290,8 @@
      * <p> The OROMatcher split method does not follow this behavior.  The
      * following Vector would be produced by OROMatcher:
      * <ul><p><code> { "8", "12",  "15", "18" } </code> </ul>
  -   * <p> To obtain the Perl behavior, use split method in the PerlTools
  -   * package available from
  -   * <a href="http://www.oroinc.com/"> http://www.oroinc.com/ </a>.
  +   * <p> To obtain the Perl behavior, use
  +   * {@link org.apache.oro.text.perl.Perl5Util#split}.
      * </ul>
      * <p>
      * This method is identical to calling:
  @@ -204,6 +299,8 @@
      * split(matcher, pattern, input, Util.SPLIT_ALL);
      * </pre></blockquote>
      * <p>
  +   * @deprecated Use {@link #split(List, PatternMatcher, Pattern, String)}
  +   *             instead.
      * @param matcher The regular expression matcher to execute the split.
      * @param pattern The regular expression to use as a split delimiter.
      * @param input  The <code>String</code> to split.
  
  
  
  1.3       +116 -1    jakarta-oro/src/java/org/apache/oro/text/perl/Perl5Util.java
  
  Index: Perl5Util.java
  ===================================================================
  RCS file: /home/cvs/jakarta-oro/src/java/org/apache/oro/text/perl/Perl5Util.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- Perl5Util.java	2000/07/23 23:25:23	1.2
  +++ Perl5Util.java	2000/09/15 05:17:26	1.3
  @@ -148,7 +148,7 @@
    * (or even a split, but this isn't particularly useful).
   
    @author <a href="mailto:dfs@savarese.org">Daniel F. Savarese</a>
  - @version $Id: Perl5Util.java,v 1.2 2000/07/23 23:25:23 jon Exp $
  + @version $Id: Perl5Util.java,v 1.3 2000/09/15 05:17:26 dfs Exp $
   
    * @see MalformedPerl5PatternException
    * @see org.apache.oro.text.PatternCache
  @@ -687,6 +687,116 @@
   
   
     /**
  +   * Splits a String into strings that are appended to a List, but no more
  +   * than a specified limit.  The String is split using a regular expression
  +   * as the delimiter.  The regular expression is a pattern specified
  +   * in Perl5 native format:
  +   * <blockquote><pre>
  +   * [m]/pattern/[i][m][s][x]
  +   * </pre></blockquote>
  +   * The <code>m</code> prefix is optional and the meaning of the optional
  +   * trailing options are:
  +   * <dl compact> 
  +   * <dt> i <dd> case insensitive match
  +   * <dt> m <dd> treat the input as consisting of multiple lines
  +   * <dt> s <dd> treat the input as consisting of a single line
  +   * <dt> x <dd> enable extended expression syntax incorporating whitespace
  +   *             and comments
  +   * </dl>
  +   * As with Perl, any non-alphanumeric character can be used in lieu of
  +   * the slashes.
  +   * <p>
  +   * The limit parameter causes the string to be split on at most the first
  +   * <b>limit - 1</b> number of pattern occurrences.
  +   * <p>
  +   * Of special note is that this split method performs EXACTLY the same
  +   * as the Perl split() function.  In other words, if the split pattern
  +   * contains parentheses, additional List elements are created from
  +   * each of the matching subgroups in the pattern.  Using an example
  +   * similar to the one from the Camel book:
  +   * <blockquote><pre>
  +   * split(list, "/([,-])/", "8-12,15,18")
  +   * </pre></blockquote>
  +   * produces the List containing:
  +   * <blockquote><pre>
  +   * { "8", "-", "12", ",", "15", ",", "18" }
  +   * </pre></blockquote>
  +   * The Util.split() method in the
  +   * OROMatcher<font size="-2"><sup>TM</sup></font> package does NOT
  +   * implement this particular behavior because it is intended to
  +   * be usable with Pattern instances other than Perl5Pattern.
  +   * <p>
  +   * @param results 
  +   *    A <code> List </code> to which the substrings of the input
  +   *    that occur between the regular expression delimiter occurrences
  +   *    are appended. The input will not be split into any more substrings
  +   *    than the specified 
  +   *    limit. A way of thinking of this is that only the first
  +   *    <b>limit - 1</b>
  +   *    matches of the delimiting regular expression will be used to split the
  +   *    input. 
  +   * @param pattern The regular expression to use as a split delimiter.
  +   * @param input The String to split.
  +   * @param limit The limit on the number of substrings the input is split into.
  +   *   Values <= 0 produce the same behavior as the SPLIT_ALL constant which
  +   *   causes the limit to be ignored and splits to be performed on all
  +   *   occurrences of the pattern.  You should use the SPLIT_ALL constant
  +   *   to achieve this behavior instead of relying on the default behavior
  +   *   associated with non-positive limit values.
  +   * @exception MalformedPerl5PatternException  If there is an error in
  +   *            the expression.  You are not forced to catch this exception
  +   *            because it is derived from RuntimeException.
  +   */
  +  public synchronized void split(List results, String pattern,
  +				   String input, int limit)
  +       throws MalformedPerl5PatternException 
  +  {
  +    int beginOffset, groups, index;
  +    String group;
  +    MatchResult currentResult = null;
  +    PatternMatcherInput pinput;
  +    Pattern compiledPattern;
  +
  +    compiledPattern = __parseMatchExpression(pattern);
  +
  +    pinput = new PatternMatcherInput(input);
  +    beginOffset = 0;
  +
  +    while(--limit != 0 && __matcher.contains(pinput, compiledPattern)) {
  +      currentResult = __matcher.getMatch();
  +
  +      results.add(input.substring(beginOffset,
  +				  currentResult.beginOffset(0)));
  +      if((groups = currentResult.groups()) > 1) {
  +	for(index = 1; index < groups; ++index) {
  +	  group = currentResult.group(index);
  +	  if(group != null && group.length() > 0)
  +	    results.add(group);
  +	}
  +      }
  +
  +      beginOffset = currentResult.endOffset(0);
  +    }
  +
  +    results.add(input.substring(beginOffset, input.length()));
  +
  +    // Just for the sake of completeness
  +    __lastMatch = currentResult;
  +  }
  +
  +  /**
  +   * This method is identical to calling:
  +   * <blockquote><pre>
  +   * split(results, pattern, input, SPLIT_ALL);
  +   * </pre></blockquote>
  +   */
  +  public synchronized void split(List results, String pattern, String input)
  +       throws MalformedPerl5PatternException 
  +  {
  +    split(results, pattern, input, SPLIT_ALL);
  +  }
  +
  +  /**
      * Splits a String into strings contained in a Vector of size no greater
      * than a specified limit.  The String is split using a regular expression
      * as the delimiter.  The regular expression is a pattern specified
  @@ -726,6 +836,9 @@
      * implement this particular behavior because it is intended to
      * be usable with Pattern instances other than Perl5Pattern.
      * <p>
  +   * @deprecated Use
  +   *     {@link #split(List, String, String, int)}
  +   *     instead.
      * @param pattern The regular expression to use as a split delimiter.
      * @param input The String to split.
      * @param limit The limit on the size of the returned <code>Vector</code>.
  @@ -789,6 +902,8 @@
      * <blockquote><pre>
      * split(pattern, input, SPLIT_ALL);
      * </pre></blockquote>
  +   * @deprecated Use {@link #split(List, String, String)}
  +   *             instead.
      */
     public synchronized Vector split(String pattern, String input)
          throws MalformedPerl5PatternException