You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oro-dev@jakarta.apache.org by df...@locus.apache.org on 2000/09/15 07:17:27 UTC
cvs commit: jakarta-oro/src/java/org/apache/oro/text/perl Perl5Util.java
dfs 00/09/14 22:17:27
Modified: src/java/org/apache/oro/text/regex Util.java
src/java/org/apache/oro/text/perl Perl5Util.java
Log:
Changed split method to accept a List argument to which results are
appended rather than create a new Vector each time. This makes
it easier to reduce memory allocation overheads through the reuse of List
instances.
Revision Changes Path
1.3 +117 -20 jakarta-oro/src/java/org/apache/oro/text/regex/Util.java
Index: Util.java
===================================================================
RCS file: /home/cvs/jakarta-oro/src/java/org/apache/oro/text/regex/Util.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- Util.java 2000/07/23 23:25:27 1.2
+++ Util.java 2000/09/15 05:17:24 1.3
@@ -85,7 +85,7 @@
* </ol>
@author <a href="mailto:dfs@savarese.org">Daniel F. Savarese</a>
- @version $Id: Util.java,v 1.2 2000/07/23 23:25:27 jon Exp $
+ @version $Id: Util.java,v 1.3 2000/09/15 05:17:24 dfs Exp $
* @see Pattern
* @see PatternMatcher
@@ -111,7 +111,114 @@
*/
private Util() { }
+
+ /**
+ * Splits up a <code>String</code> instance and stores results as a
+ * <code>List</code> of substrings numbering no more than a specified
+ * limit. The string is split with a regular expression as the delimiter.
+ * The <b>limit</b> parameter essentially says to split the
+ * string on at most the first <b>limit - 1</b> pattern
+ * occurrences.
+ * <p>
+ * This method is inspired by the Perl split() function and behaves
+ * identically to it when used in conjunction with the Perl5Matcher and
+ * Perl5Pattern classes except for the following difference:
+ * <ul><p>
+ * In Perl, if the split expression contains parentheses, the split()
+ * method creates additional list elements from each of the matching
+ * subgroups in the pattern. In other words:
+ * <ul><p>
+ * <code>split(list, "/([,-])/", "8-12,15,18", Util.SPLIT_ALL)</code></ul>
+ * <p> produces the list containing:
+ * <ul><p><code> { "8", "-", "12", ",", "15", ",", "18" } </code> </ul>
+ * <p> The OROMatcher split method does not follow this behavior. The
+ * following list would be produced by OROMatcher:
+ * <ul><p><code> { "8", "12", "15", "18" } </code> </ul>
+ * <p> To obtain the Perl behavior, use
+ * {@link org.apache.oro.text.perl.Perl5Util#split}.
+ * </ul>
+ * <p>
+ * @param results A List to which the split results are appended.
+ * After the method returns, it contains the substrings of the input
+ * that occur between the regular expression delimiter occurrences.
+ * The input will not be split into any more substrings than the
+ * specified <code>limit</code>. A way of thinking of this is that
+ * only the first <code>limit - 1</code> matches of the delimiting
+ * regular expression will be used to split the input.
+ * @param matcher The regular expression matcher to execute the split.
+ * @param pattern The regular expression to use as a split delimiter.
+ * @param input The <code>String</code> to split.
+ * @param limit The limit on the number of resulting split elements.
+ * Values <= 0 produce the same behavior as using the
+ * <b>SPLIT_ALL</b> constant which causes the limit to be
+ * ignored and splits to be performed on all occurrences of
+ * the pattern. You should use the <b>SPLIT_ALL</b> constant
+ * to achieve this behavior instead of relying on the default
+ * behavior associated with non-positive limit values.
+ */
+ public static void split(List results, PatternMatcher matcher,
+ Pattern pattern, String input, int limit)
+ {
+ int beginOffset;
+ MatchResult currentResult;
+ PatternMatcherInput pinput;
+
+ pinput = new PatternMatcherInput(input);
+ beginOffset = 0;
+
+ while(--limit != 0 && matcher.contains(pinput, pattern)) {
+ currentResult = matcher.getMatch();
+ results.add(input.substring(beginOffset,
+ currentResult.beginOffset(0)));
+ beginOffset = currentResult.endOffset(0);
+ }
+
+ results.add(input.substring(beginOffset, input.length()));
+ }
+
+
/**
+ * Splits up a <code>String</code> instance and stores results as a
+ * <code>List</code> of all its substrings using a regular expression
+ * as the delimiter.
+ * This method is inspired by the Perl split() function and behaves
+ * identically to it when used in conjunction with the Perl5Matcher and
+ * Perl5Pattern classes except for the following difference:
+ * <p>
+ * <ul>
+ * In Perl, if the split expression contains parentheses, the split()
+ * method creates additional list elements from each of the matching
+ * subgroups in the pattern. In other words:
+ * <ul><p><code>split(list, "/([,-])/", "8-12,15,18")</code></ul>
+ * <p> produces the list containing:
+ * <ul><p><code> { "8", "-", "12", ",", "15", ",", "18" } </code> </ul>
+ * <p> The OROMatcher split method does not follow this behavior. The
+ * following list would be produced by OROMatcher:
+ * <ul><p><code> { "8", "12", "15", "18" } </code> </ul>
+ * <p> To obtain the Perl behavior, use
+ * {@link org.apache.oro.text.perl.Perl5Util#split}.
+ * </ul>
+ * <p>
+ * This method is identical to calling:
+ * <blockquote><pre>
+ * split(results, matcher, pattern, input, Util.SPLIT_ALL);
+ * </pre></blockquote>
+ * <p>
+ * @param results A <code>List</code> to which all the substrings of
+ * the input that occur between the regular expression delimiter
+ * occurrences are appended.
+ * @param matcher The regular expression matcher to execute the split.
+ * @param pattern The regular expression to use as a split delimiter.
+ * @param input The <code>String</code> to split.
+ */
+ public static void split(List results, PatternMatcher matcher,
+ Pattern pattern, String input)
+ {
+ split(results, matcher, pattern, input, SPLIT_ALL);
+ }
+
+ /**
* Splits up a <code>String</code> instance into strings contained in a
* <code>Vector</code> of size not greater than a specified limit. The
* string is split with a regular expression as the delimiter.
@@ -132,11 +239,12 @@
* <p> The OROMatcher split method does not follow this behavior. The
* following Vector would be produced by OROMatcher:
* <ul><p><code> { "8", "12", "15", "18" } </code> </ul>
- * <p> To obtain the Perl behavior, use split method in the PerlTools
- * package available from
- * <a href="http://www.oroinc.com/"> http://www.oroinc.com/ </a>.
+ * <p> To obtain the Perl behavior, use
+ * {@link org.apache.oro.text.perl.Perl5Util#split}.
* </ul>
* <p>
+ * @deprecated Use {@link #split(List, PatternMatcher, Pattern, String, int)}
+ * instead.
* @param matcher The regular expression matcher to execute the split.
* @param pattern The regular expression to use as a split delimiter.
* @param input The <code>String</code> to split.
@@ -157,21 +265,9 @@
public static Vector split(PatternMatcher matcher, Pattern pattern,
String input, int limit)
{
- int beginOffset;
Vector results = new Vector(20);
- MatchResult currentResult;
- PatternMatcherInput pinput;
-
- pinput = new PatternMatcherInput(input);
- beginOffset = 0;
- while(--limit != 0 && matcher.contains(pinput, pattern)) {
- currentResult = matcher.getMatch();
- results.addElement(input.substring(beginOffset,
- currentResult.beginOffset(0)));
- beginOffset = currentResult.endOffset(0);
- }
- results.addElement(input.substring(beginOffset, input.length()));
+ split(results, matcher, pattern, input, limit);
return results;
}
@@ -194,9 +290,8 @@
* <p> The OROMatcher split method does not follow this behavior. The
* following Vector would be produced by OROMatcher:
* <ul><p><code> { "8", "12", "15", "18" } </code> </ul>
- * <p> To obtain the Perl behavior, use split method in the PerlTools
- * package available from
- * <a href="http://www.oroinc.com/"> http://www.oroinc.com/ </a>.
+ * <p> To obtain the Perl behavior, use
+ * {@link org.apache.oro.text.perl.Perl5Util#split}.
* </ul>
* <p>
* This method is identical to calling:
@@ -204,6 +299,8 @@
* split(matcher, pattern, input, Util.SPLIT_ALL);
* </pre></blockquote>
* <p>
+ * @deprecated Use {@link #split(List, PatternMatcher, Pattern, String)}
+ * instead.
* @param matcher The regular expression matcher to execute the split.
* @param pattern The regular expression to use as a split delimiter.
* @param input The <code>String</code> to split.
1.3 +116 -1 jakarta-oro/src/java/org/apache/oro/text/perl/Perl5Util.java
Index: Perl5Util.java
===================================================================
RCS file: /home/cvs/jakarta-oro/src/java/org/apache/oro/text/perl/Perl5Util.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- Perl5Util.java 2000/07/23 23:25:23 1.2
+++ Perl5Util.java 2000/09/15 05:17:26 1.3
@@ -148,7 +148,7 @@
* (or even a split, but this isn't particularly useful).
@author <a href="mailto:dfs@savarese.org">Daniel F. Savarese</a>
- @version $Id: Perl5Util.java,v 1.2 2000/07/23 23:25:23 jon Exp $
+ @version $Id: Perl5Util.java,v 1.3 2000/09/15 05:17:26 dfs Exp $
* @see MalformedPerl5PatternException
* @see org.apache.oro.text.PatternCache
@@ -687,6 +687,116 @@
/**
+ * Splits a String into strings that are appended to a List, but no more
+ * than a specified limit. The String is split using a regular expression
+ * as the delimiter. The regular expression is a pattern specified
+ * in Perl5 native format:
+ * <blockquote><pre>
+ * [m]/pattern/[i][m][s][x]
+ * </pre></blockquote>
+ * The <code>m</code> prefix is optional and the meaning of the optional
+ * trailing options are:
+ * <dl compact>
+ * <dt> i <dd> case insensitive match
+ * <dt> m <dd> treat the input as consisting of multiple lines
+ * <dt> s <dd> treat the input as consisting of a single line
+ * <dt> x <dd> enable extended expression syntax incorporating whitespace
+ * and comments
+ * </dl>
+ * As with Perl, any non-alphanumeric character can be used in lieu of
+ * the slashes.
+ * <p>
+ * The limit parameter causes the string to be split on at most the first
+ * <b>limit - 1</b> number of pattern occurrences.
+ * <p>
+ * Of special note is that this split method performs EXACTLY the same
+ * as the Perl split() function. In other words, if the split pattern
+ * contains parentheses, additional List elements are created from
+ * each of the matching subgroups in the pattern. Using an example
+ * similar to the one from the Camel book:
+ * <blockquote><pre>
+ * split(list, "/([,-])/", "8-12,15,18")
+ * </pre></blockquote>
+ * produces the List containing:
+ * <blockquote><pre>
+ * { "8", "-", "12", ",", "15", ",", "18" }
+ * </pre></blockquote>
+ * The Util.split() method in the
+ * OROMatcher<font size="-2"><sup>TM</sup></font> package does NOT
+ * implement this particular behavior because it is intended to
+ * be usable with Pattern instances other than Perl5Pattern.
+ * <p>
+ * @param results
+ * A <code> List </code> to which the substrings of the input
+ * that occur between the regular expression delimiter occurrences
+ * are appended. The input will not be split into any more substrings
+ * than the specified
+ * limit. A way of thinking of this is that only the first
+ * <b>limit - 1</b>
+ * matches of the delimiting regular expression will be used to split the
+ * input.
+ * @param pattern The regular expression to use as a split delimiter.
+ * @param input The String to split.
+ * @param limit The limit on the number of substrings the input is split into.
+ * Values <= 0 produce the same behavior as the SPLIT_ALL constant which
+ * causes the limit to be ignored and splits to be performed on all
+ * occurrences of the pattern. You should use the SPLIT_ALL constant
+ * to achieve this behavior instead of relying on the default behavior
+ * associated with non-positive limit values.
+ * @exception MalformedPerl5PatternException If there is an error in
+ * the expression. You are not forced to catch this exception
+ * because it is derived from RuntimeException.
+ */
+ public synchronized void split(List results, String pattern,
+ String input, int limit)
+ throws MalformedPerl5PatternException
+ {
+ int beginOffset, groups, index;
+ String group;
+ MatchResult currentResult = null;
+ PatternMatcherInput pinput;
+ Pattern compiledPattern;
+
+ compiledPattern = __parseMatchExpression(pattern);
+
+ pinput = new PatternMatcherInput(input);
+ beginOffset = 0;
+
+ while(--limit != 0 && __matcher.contains(pinput, compiledPattern)) {
+ currentResult = __matcher.getMatch();
+
+ results.add(input.substring(beginOffset,
+ currentResult.beginOffset(0)));
+ if((groups = currentResult.groups()) > 1) {
+ for(index = 1; index < groups; ++index) {
+ group = currentResult.group(index);
+ if(group != null && group.length() > 0)
+ results.add(group);
+ }
+ }
+
+ beginOffset = currentResult.endOffset(0);
+ }
+
+ results.add(input.substring(beginOffset, input.length()));
+
+ // Just for the sake of completeness
+ __lastMatch = currentResult;
+ }
+
+ /**
+ * This method is identical to calling:
+ * <blockquote><pre>
+ * split(results, pattern, input, SPLIT_ALL);
+ * </pre></blockquote>
+ */
+ public synchronized void split(List results, String pattern, String input)
+ throws MalformedPerl5PatternException
+ {
+ split(results, pattern, input, SPLIT_ALL);
+ }
+
+ /**
* Splits a String into strings contained in a Vector of size no greater
* than a specified limit. The String is split using a regular expression
* as the delimiter. The regular expression is a pattern specified
@@ -726,6 +836,9 @@
* implement this particular behavior because it is intended to
* be usable with Pattern instances other than Perl5Pattern.
* <p>
+ * @deprecated Use
+ * {@link #split(List results, String pattern, String input, int limit)}
+ * instead.
* @param pattern The regular expression to use as a split delimiter.
* @param input The String to split.
* @param limit The limit on the size of the returned <code>Vector</code>.
@@ -789,6 +902,8 @@
* <blockquote><pre>
* split(pattern, input, SPLIT_ALL);
* </pre></blockquote>
+ * @deprecated Use {@link #split(List results, String pattern, String input)}
+ * instead.
*/
public synchronized Vector split(String pattern, String input)
throws MalformedPerl5PatternException