You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@harmony.apache.org by te...@apache.org on 2006/03/29 13:52:15 UTC
svn commit: r389769 [1/2] - in
/incubator/harmony/enhanced/classlib/trunk/modules/regex-beans-math:
src/common/javasrc/java/util/regex/Pattern.java
test/common/unit/java/util/regex/PatternTest.java
Author: tellison
Date: Wed Mar 29 03:52:12 2006
New Revision: 389769
URL: http://svn.apache.org/viewcvs?rev=389769&view=rev
Log:
Fix for HARMONY-271 (java.util.regex.Pattern.split() method discards last trailing empty string)
Modified:
incubator/harmony/enhanced/classlib/trunk/modules/regex-beans-math/src/common/javasrc/java/util/regex/Pattern.java
incubator/harmony/enhanced/classlib/trunk/modules/regex-beans-math/test/common/unit/java/util/regex/PatternTest.java
Modified: incubator/harmony/enhanced/classlib/trunk/modules/regex-beans-math/src/common/javasrc/java/util/regex/Pattern.java
URL: http://svn.apache.org/viewcvs/incubator/harmony/enhanced/classlib/trunk/modules/regex-beans-math/src/common/javasrc/java/util/regex/Pattern.java?rev=389769&r1=389768&r2=389769&view=diff
==============================================================================
--- incubator/harmony/enhanced/classlib/trunk/modules/regex-beans-math/src/common/javasrc/java/util/regex/Pattern.java (original)
+++ incubator/harmony/enhanced/classlib/trunk/modules/regex-beans-math/src/common/javasrc/java/util/regex/Pattern.java Wed Mar 29 03:52:12 2006
@@ -1,993 +1,997 @@
-/*
- * Copyright 2005 The Apache Software Foundation or its licensors, as applicable.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * @author Nikolay A. Kuznetsov
- * @version $Revision: 1.36.2.2 $
- */
-package java.util.regex;
-
-import java.io.Serializable;
-
-import java.util.ArrayList;
-
-/**
- * @com.intel.drl.spec_ref
- *
- * @author Nikolay A. Kuznetsov
- * @version $Revision: 1.36.2.2 $
- */
-public final class Pattern implements Serializable {
-
- private static final long serialVersionUID = 5073258162644648461L;
-
- static final boolean _DEBUG_ = false;
-
- /**
- * @com.intel.drl.spec_ref
- */
- public static final int UNIX_LINES = 1 << 0;
-
- /**
- * @com.intel.drl.spec_ref
- */
- public static final int CASE_INSENSITIVE = 1 << 1;
-
- /**
- * @com.intel.drl.spec_ref
- */
- public static final int COMMENTS = 1 << 2;
-
- /**
- * @com.intel.drl.spec_ref
- */
- public static final int MULTILINE = 1 << 3;
-
- /**
- * @com.intel.drl.spec_ref
- */
- public static final int LITERAL = 1 << 4;
-
- /**
- * @com.intel.drl.spec_ref
- */
- public static final int DOTALL = 1 << 5;
-
- /**
- * @com.intel.drl.spec_ref
- */
- public static final int UNICODE_CASE = 1 << 6;
-
- /**
- * @com.intel.drl.spec_ref
- */
- public static final int CANON_EQ = 1 << 7;
-
- /**
- * Current <code>pattern</code> to be compiled;
- */
- private Lexer lexemes = null;
-
- /**
- * Pattern compile flags;
- */
- private int flags = 0;
-
- private String pattern = null;
-
- transient private int groupIndex = -1;
-
- transient private int globalGroupIndex = -1;
-
- transient private int compCount = -1;
-
- transient private int consCount = -1;
-
- transient AbstractSet start = null;
-
- /**
- * @com.intel.drl.spec_ref
- */
- public Matcher matcher(CharSequence cs) {
- return new Matcher(this, cs);
- }
-
- /**
- * @com.intel.drl.spec_ref
- */
- public String[] split(CharSequence input, int limit) {
- ArrayList res = new ArrayList();
- Matcher mat = matcher(input);
- int index = 0;
- int curPos = 0;
- while (mat.find() && (index + 1 < limit || limit <= 0)) {
- res.add(input.subSequence(curPos, mat.start()).toString());
- curPos = mat.end();
- index++;
- }
-
- if (curPos < input.length()) {
- res.add(input.subSequence(curPos, input.length()).toString());
- index++;
- }
-
- // discard trailing empty strings
- // //
- if (limit == 0)
- while (--index >= 0 && res.get(index).toString().length() == 0) {
- res.remove(index);
- }
-
- return (String[]) res.toArray(new String[index >= 0 ? index : 0]);
- }
-
- /**
- * @com.intel.drl.spec_ref
- */
- public String[] split(CharSequence input) {
- return split(input, 0);
- }
-
- /**
- * @com.intel.drl.spec_ref
- */
- public String pattern() {
- return lexemes.toString();
- }
-
- /**
- * @com.intel.drl.spec_ref
- */
- public String toString() {
- return this.pattern();
- }
-
- /**
- * @com.intel.drl.spec_ref
- */
- public int flags() {
- return this.flags;
- }
-
- /**
- * @com.intel.drl.spec_ref
- */
- public static Pattern compile(String regex, int flags)
- throws PatternSyntaxException {
- AbstractSet.counter = 1;
-
- return new Pattern().compileImpl(regex, flags);
- }
-
- /**
- *
- * @param pattern -
- * Regular expression to be compiled
- * @param flags -
- * The bit mask including CASE_INSENSITIVE, MULTILINE, DOTALL,
- * UNICODE_CASE, and CANON_EQ
- *
- * @return Compiled pattern
- */
- private Pattern compileImpl(String regex, int flags)
- throws PatternSyntaxException {
- this.lexemes = new Lexer(regex, flags);
- this.flags = flags;
-
- start = processExpression(-1, flags, null);
- if (!lexemes.isEmpty()) {
- throw new PatternSyntaxException(I18n
- .getMessage("Trailing characters"), lexemes.toString(),
- lexemes.getIndex());
- }
-
- return this;
- }
-
- /**
- * A->(a|)+
- */
- private AbstractSet processAlternations(AbstractSet last) {
- CharClass auxRange = new CharClass(hasFlag(Pattern.CASE_INSENSITIVE),
- hasFlag(Pattern.UNICODE_CASE));
- while (!lexemes.isEmpty()
- && lexemes.isLetter()
- && (lexemes.lookAhead() == 0
- || lexemes.lookAhead() == Lexer.CHAR_VERTICAL_BAR || lexemes
- .lookAhead() == Lexer.CHAR_RIGHT_PARENTHESIS)) {
- auxRange.add(lexemes.next());
- if (lexemes.peek() == Lexer.CHAR_VERTICAL_BAR)
- lexemes.next();
- }
-
- if (!auxRange.hasUCI()) {
- return new RangeSet(auxRange, last);
- } else {
- return new UCIRangeSet(auxRange, last);
- }
- }
-
- /**
- * E->AE; E->S|E; E->S; A->(a|)+ E->S(|S)*
- */
- private AbstractSet processExpression(int ch, int new_flags,
- AbstractSet last) {
- ArrayList children = new ArrayList();
- AbstractSet child;
- int safe_flags = flags;
- FSet fSet;
-
- if (new_flags != flags) {
- flags = new_flags;
- lexemes.setFlags(flags);
- }
-
- switch (ch) {
- case Lexer.CHAR_NONCAP_GROUP: {
- fSet = new NonCapFSet(++consCount);
- break;
- }
-
- case Lexer.CHAR_POS_LOOKAHEAD:
- case Lexer.CHAR_NEG_LOOKAHEAD: {
- fSet = new AheadFSet();
- break;
- }
- case Lexer.CHAR_POS_LOOKBEHIND:
- case Lexer.CHAR_NEG_LOOKBEHIND: {
- fSet = new BehindFSet(++consCount);
- break;
- }
-
- case Lexer.CHAR_ATOMIC_GROUP: {
- fSet = new AtomicFSet(++consCount);
- break;
- }
-
- default: {
- globalGroupIndex++;
- if (last == null) {
- fSet = new FinalSet();
- // expr = new StartSet();
- } else {
- fSet = new FSet(globalGroupIndex);
- // expr = new JointSet(globalGroupIndex);
- }
- }
- }
-
- do {
- if (lexemes.isLetter()
- && lexemes.lookAhead() == Lexer.CHAR_VERTICAL_BAR) {
- child = processAlternations(fSet);
- } else {
- child = processSubExpression(fSet);
- if (lexemes.peek() == Lexer.CHAR_VERTICAL_BAR)
- lexemes.next();
- }
- if (child != null)
- children.add(child);
- // expr.addChild(child);
- } while (!(lexemes.isEmpty() || lexemes.peek() == Lexer.CHAR_RIGHT_PARENTHESIS));
-
- if (flags != safe_flags) {
- flags = safe_flags;
- lexemes.setFlags(flags);
- }
-
- switch (ch) {
- case Lexer.CHAR_NONCAP_GROUP: {
- return new NonCapJointSet(children, fSet);
- }
- case Lexer.CHAR_POS_LOOKAHEAD: {
- return new PositiveLookAhead(children, fSet);
- }
-
- case Lexer.CHAR_NEG_LOOKAHEAD: {
- return new NegativeLookAhead(children, fSet);
- }
-
- case Lexer.CHAR_POS_LOOKBEHIND: {
- return new PositiveLookBehind(children, fSet);
- }
-
- case Lexer.CHAR_NEG_LOOKBEHIND: {
- return new NegativeLookBehind(children, fSet);
- }
-
- case Lexer.CHAR_ATOMIC_GROUP: {
- return new AtomicJointSet(children, fSet);
- }
- default: {
- switch (children.size()) {
- case 0:
- return new EmptySet(fSet);
- case 1:
- return new SingleSet((AbstractSet) children.get(0), fSet);
- default:
- return new JointSet(children, fSet);
- }
- }
- }
- }
-
- /**
- * T->a+
- */
- private AbstractSet processSequence(AbstractSet last) {
- StringBuffer substring = new StringBuffer();
- while (!lexemes.isEmpty()
- && lexemes.isLetter()
- && ((!lexemes.isNextSpecial() && lexemes.lookAhead() == 0) // end
- // of
- // pattern
- || (!lexemes.isNextSpecial() && Lexer.isLetter(lexemes
- .lookAhead()))
- || lexemes.lookAhead() == Lexer.CHAR_RIGHT_PARENTHESIS
- || (lexemes.lookAhead() & 0x8000ffff) == Lexer.CHAR_LEFT_PARENTHESIS
- || lexemes.lookAhead() == Lexer.CHAR_VERTICAL_BAR || lexemes
- .lookAhead() == Lexer.CHAR_DOLLAR)) {
- substring.append((char) lexemes.next());
- }
- if (!hasFlag(Pattern.CASE_INSENSITIVE)) {
- return new SequenceSet(substring);
- } else if (!hasFlag(Pattern.UNICODE_CASE)) {
- return new CISequenceSet(substring);
- } else {
- return new UCISequenceSet(substring);
- }
- }
-
- /**
- * S->BS; S->QS; S->Q; B->a+
- */
- private AbstractSet processSubExpression(AbstractSet last) {
- AbstractSet cur;
- if (lexemes.isLetter() && !lexemes.isNextSpecial()
- && Lexer.isLetter(lexemes.lookAhead())) {
- cur = processSequence(last);
- } else {
- cur = processQuantifier(last);
- }
-
- if (!lexemes.isEmpty()
- // && !pattern.isQuantifier()
- && lexemes.peek() != Lexer.CHAR_RIGHT_PARENTHESIS
- && lexemes.peek() != Lexer.CHAR_VERTICAL_BAR) {
- AbstractSet next = processSubExpression(last);
- if (cur instanceof LeafQuantifierSet
- // TODO create personal UnifiedQuantifierSet for composite
- // quantifiers
- // to take into account Quantifier counters
- // ////
- && !(cur instanceof CompositeQuantifierSet)
- && !(cur instanceof GroupQuantifierSet)
- && !next.first(((LeafQuantifierSet) cur).getInnerSet())) {
- cur = new UnifiedQuantifierSet((LeafQuantifierSet) cur);
- }
- if (((char) next.getType()) == '+') {
- cur.setNext(((LeafQuantifierSet) next).getInnerSet());
- } else {
- cur.setNext(next);
- }
- } else if (cur != null) {
- cur.setNext(last);
- } else {
- return null;
- }
-
- if (((char) cur.getType()) == '+') {
- return ((QuantifierSet) cur).getInnerSet();
- } else {
- return cur;
- }
- }
-
- /**
- * Q->T(*|+|?...) also do some optimizations.
- *
- */
- private AbstractSet processQuantifier(AbstractSet last) {
- AbstractSet term = processTerminal(last);
- SpecialToken quantifier = null;
-
- int quant = lexemes.peek();
-
- if (term != null && !(term instanceof LeafSet)) {
- switch (quant) {
- case Lexer.QUANT_STAR:
- case Lexer.QUANT_PLUS: {
- lexemes.next();
- GroupQuantifierSet q = new GroupQuantifierSet(term, last, quant);
- term.setNext(q);
- return q;
- }
-
- case Lexer.QUANT_STAR_R:
- case Lexer.QUANT_PLUS_R: {
- lexemes.next();
- GroupQuantifierSet q = new ReluctantGroupQuantifierSet(term,
- last, quant);
- term.setNext(q);
- return q;
- }
-
- case Lexer.QUANT_PLUS_P: {
- lexemes.next();
- // possessive plus will be handled by unique class
- // and should not be postprocessed to point previous set
- // to the inner one.
- // //
- return new PosPlusGroupQuantifierSet(term, last,
- Lexer.QUANT_STAR_P);
- }
-
- case Lexer.QUANT_STAR_P: {
- lexemes.next();
- return new PossessiveGroupQuantifierSet(term, last, quant);
- }
-
- case Lexer.QUANT_ALT: {
- lexemes.next();
- AltGroupQuantifierSet q = new AltGroupQuantifierSet(term, last,
- Lexer.QUANT_ALT);
- term.setNext(last);
- return q;
- }
-
- case Lexer.QUANT_ALT_P: {
- lexemes.next();
- return new PosAltGroupQuantifierSet(term, last, Lexer.QUANT_ALT);
- }
-
- case Lexer.QUANT_ALT_R: {
- lexemes.next();
- RelAltGroupQuantifierSet q = new RelAltGroupQuantifierSet(term,
- last, Lexer.QUANT_ALT);
- term.setNext(last);
- return q;
- }
-
- case Lexer.QUANT_COMP: {
- CompositeGroupQuantifierSet q = new CompositeGroupQuantifierSet(
- (Quantifier) lexemes.nextSpecial(), term, last,
- Lexer.QUANT_ALT, ++compCount);
- term.setNext(q);
- return q;
- }
-
- case Lexer.QUANT_COMP_P: {
- return new PosCompositeGroupQuantifierSet((Quantifier) lexemes
- .nextSpecial(), term, last, Lexer.QUANT_ALT,
- ++compCount);
- }
-
- case Lexer.QUANT_COMP_R: {
- RelCompositeGroupQuantifierSet q = new RelCompositeGroupQuantifierSet(
- (Quantifier) lexemes.nextSpecial(), term, last,
- Lexer.QUANT_ALT, ++compCount);
- term.setNext(q);
- return q;
- }
-
- default:
- return term;
- }
- } else {
- LeafSet leaf = null;
- if (term != null)
- leaf = (LeafSet) term;
- switch (quant) {
- case Lexer.QUANT_STAR:
- case Lexer.QUANT_PLUS: {
- lexemes.next();
- LeafQuantifierSet q;
- if (term.getType() == AbstractSet.TYPE_DOTSET) {
- if (!hasFlag(Pattern.DOTALL)) {
- q = new DotQuantifierSet(leaf, last, quant,
- AbstractLineTerminator.getInstance(flags));
- } else {
- q = new DotAllQuantifierSet(leaf, last, quant);
- }
- } else {
- q = new LeafQuantifierSet(leaf, last, quant);
- }
- leaf.setNext(q);
- return q;
- }
-
- case Lexer.QUANT_STAR_R:
- case Lexer.QUANT_PLUS_R: {
- lexemes.next();
- ReluctantQuantifierSet q = new ReluctantQuantifierSet(leaf,
- last, quant);
- leaf.setNext(q);
- return q;
- }
-
- case Lexer.QUANT_PLUS_P:
- case Lexer.QUANT_STAR_P: {
- lexemes.next();
- PossessiveQuantifierSet q = new PossessiveQuantifierSet(leaf,
- last, quant);
- leaf.setNext(q);
- return q;
- }
-
- case Lexer.QUANT_ALT: {
- lexemes.next();
- return new AltQuantifierSet(leaf, last, Lexer.QUANT_ALT);
- }
-
- case Lexer.QUANT_ALT_P: {
- lexemes.next();
- return new PossessiveAltQuantifierSet(leaf, last,
- Lexer.QUANT_ALT_P);
- }
-
- case Lexer.QUANT_ALT_R: {
- lexemes.next();
- return new ReluctantAltQuantifierSet(leaf, last,
- Lexer.QUANT_ALT_R);
- }
-
- case Lexer.QUANT_COMP: {
- return new CompositeQuantifierSet((Quantifier) lexemes
- .nextSpecial(), leaf, last, Lexer.QUANT_COMP);
- }
-
- case Lexer.QUANT_COMP_P: {
- return new PossessiveCompositeQuantifierSet(
- (Quantifier) lexemes.nextSpecial(), leaf, last,
- Lexer.QUANT_COMP_P);
- }
- case Lexer.QUANT_COMP_R: {
- return new ReluctantCompositeQuantifierSet((Quantifier) lexemes
- .nextSpecial(), leaf, last, Lexer.QUANT_COMP_R);
- }
-
- default:
- return term;
- }
- }
- }
-
- /**
- * T-> letter|[range]|{char-class}|(E)
- */
- private AbstractSet processTerminal(AbstractSet last) {
- int ch;
- AbstractSet term = null;
- do {
- ch = lexemes.peek();
- if ((ch & 0x8000ffff) == Lexer.CHAR_LEFT_PARENTHESIS) {
- lexemes.next();
- int new_flags = (ch & 0x00ff0000) >> 16;
- ch = ch & 0xff00ffff;
-
- if (ch == Lexer.CHAR_FLAGS) {
- flags = new_flags;
- lexemes.setFlags(new_flags);
- } else {
- new_flags = (ch == Lexer.CHAR_NONCAP_GROUP) ? new_flags
- : flags;
- term = processExpression(ch, new_flags, last);
- if (lexemes.peek() != Lexer.CHAR_RIGHT_PARENTHESIS)
- throw new PatternSyntaxException(I18n
- .getMessage("unmatched ("), lexemes.toString(),
- lexemes.getIndex());
- lexemes.next();
- }
-
- } else
- switch (ch) {
- case Lexer.CHAR_LEFT_SQUARE_BRACKET: {
- lexemes.next();
- boolean negative = false;
- if (lexemes.peek() == Lexer.CHAR_CARET) {
- negative = true;
- lexemes.next();
- }
-
- term = processRange(negative, last);
- if (lexemes.peek() != Lexer.CHAR_RIGHT_SQUARE_BRACKET)
- throw new PatternSyntaxException(I18n
- .getMessage("unmatched ["), lexemes.toString(),
- lexemes.getIndex());
- lexemes.setMode(Lexer.MODE_PATTERN);
- lexemes.next();
- break;
- }
-
- case Lexer.CHAR_DOT: {
- lexemes.next();
-
- if (!hasFlag(Pattern.DOTALL)) {
- term = new DotSet(AbstractLineTerminator
- .getInstance(flags));
- } else {
- term = new DotAllSet();
- }
-
- break;
- }
-
- case Lexer.CHAR_CARET: {
- lexemes.next();
- consCount++;
- if (!hasFlag(Pattern.MULTILINE)) {
- term = new SOLSet();
- } else {
- term = new MultiLineSOLSet(AbstractLineTerminator
- .getInstance(flags));
- }
-
- break;
- }
-
- case Lexer.CHAR_DOLLAR: {
- lexemes.next();
- consCount++;
- if (!hasFlag(Pattern.MULTILINE)) {
- if (!hasFlag(Pattern.UNIX_LINES)) {
- term = new EOLSet(consCount);
- } else {
- term = new UEOLSet(consCount);
- }
- } else {
- if (!hasFlag(Pattern.UNIX_LINES)) {
- term = new MultiLineEOLSet(consCount);
- } else {
- term = new UMultiLineEOLSet(consCount);
- }
- }
-
- break;
- }
-
- case Lexer.CHAR_WORD_BOUND: {
- lexemes.next();
- term = new WordBoundary(true);
- break;
- }
-
- case Lexer.CHAR_NONWORD_BOUND: {
- lexemes.next();
- term = new WordBoundary(false);
- break;
- }
-
- case Lexer.CHAR_END_OF_INPUT: {
- lexemes.next();
- term = new EOISet();
- break;
- }
-
- case Lexer.CHAR_END_OF_LINE: {
- lexemes.next();
- term = new EOLSet(++consCount);
- break;
- }
-
- case Lexer.CHAR_START_OF_INPUT: {
- lexemes.next();
- term = new SOLSet();
- break;
- }
-
- case Lexer.CHAR_PREVIOUS_MATCH: {
- lexemes.next();
- term = new PreviousMatch();
- break;
- }
-
- case 0x80000000 | '1':
- case 0x80000000 | '2':
- case 0x80000000 | '3':
- case 0x80000000 | '4':
- case 0x80000000 | '5':
- case 0x80000000 | '6':
- case 0x80000000 | '7':
- case 0x80000000 | '8':
- case 0x80000000 | '9': {
- int number = (ch & 0x7FFFFFFF) - '0';
- if (globalGroupIndex >= number) {
- lexemes.next();
- consCount++;
- if (!hasFlag(Pattern.CASE_INSENSITIVE)) {
- term = new BackReferenceSet(number, consCount);
- } else if (!hasFlag(Pattern.UNICODE_CASE)) {
- term = new CIBackReferenceSet(number, consCount);
- } else {
- term = new UCIBackReferenceSet(number, consCount);
- }
- break;
- } else {
- throw new PatternSyntaxException(I18n
- .getMessage("No such group yet exists at "
- + "this point in the pattern"), lexemes
- .toString(), lexemes.getIndex());
- }
- }
-
- case 0: {
- AbstractCharClass cc = null;
- if ((cc = (AbstractCharClass) lexemes.peekSpecial()) != null) {
- term = new RangeSet(cc);
- } else if (!lexemes.isEmpty()) {
- term = new CharSet((char) ch);
- }
- lexemes.next();
- break;
- }
-
- default: {
- if (ch >= 0 && !lexemes.isSpecial()) {
- if (hasFlag(Pattern.CASE_INSENSITIVE)) {
- if ((ch >= 'a' && ch <= 'z')
- || (ch >= 'A' && ch <= 'Z')) {
- term = new CICharSet((char) ch);
- } else if (hasFlag(Pattern.UNICODE_CASE)
- && ch > 128) {
- term = new UCICharSet((char) ch);
- } else {
- term = new CharSet((char) ch);
- }
- } else {
- term = new CharSet((char) ch);
- }
- lexemes.next();
- } else {
- throw new PatternSyntaxException(I18n
- .getMessage("Dangling meta construction")
- + ": "
- + (lexemes.isSpecial() ? lexemes.peekSpecial()
- .toString() : Character
- .toString((char) ch)), lexemes
- .toString(), lexemes.getIndex());
- }
- }
- }
- } while (ch == Lexer.CHAR_FLAGS);
- return term;
- }
-
- private AbstractSet processRange(boolean negative, AbstractSet last) {
- AbstractCharClass res = processRangeExpression(negative);
- if (!res.hasUCI()) {
- return new RangeSet(res, last);
- } else {
- return new UCIRangeSet(res, last);
- }
- }
-
- /**
- * proceess [...] ranges
- */
- private AbstractCharClass processRangeExpression(boolean alt) {
- CharClass res = new CharClass(alt, hasFlag(Pattern.CASE_INSENSITIVE),
- hasFlag(Pattern.UNICODE_CASE));
- int buffer = -1;
- // TODO remove this one, being used for debug only
- int ch = 0;
- boolean intersection = false;
- boolean notClosed = false;
- boolean firstInClass = true;
-
- while (!lexemes.isEmpty()
- && (notClosed = (ch = lexemes.peek()) != Lexer.CHAR_RIGHT_SQUARE_BRACKET
- || firstInClass)) {
- switch (lexemes.peek()) {
-
- case Lexer.CHAR_RIGHT_SQUARE_BRACKET: {
- if (buffer >= 0)
- res.add(buffer);
- buffer = ']';
- lexemes.next();
- break;
- }
- case Lexer.CHAR_LEFT_SQUARE_BRACKET: {
- lexemes.next();
- boolean negative = false;
- if (lexemes.peek() == Lexer.CHAR_CARET) {
- lexemes.next();
- negative = true;
- }
-
- if (intersection)
- res.intersection(processRangeExpression(negative));
- else
- res.union(processRangeExpression(negative));
- intersection = false;
- lexemes.next();
- break;
- }
-
- case Lexer.CHAR_AMPERSAND: {
- if (buffer >= 0)
- res.add(buffer);
- buffer = lexemes.next();
- // if there is a start for subrange we will do an intersection
- // otherwise treat '&' as normal character
- if (lexemes.peek() == Lexer.CHAR_AMPERSAND
- && lexemes.lookAhead() == Lexer.CHAR_LEFT_SQUARE_BRACKET) {
- lexemes.next();
- intersection = true;
- buffer = -1;
- }
-
- break;
- }
-
- case Lexer.CHAR_HYPHEN: {
- if (firstInClass
- || lexemes.lookAhead() == Lexer.CHAR_RIGHT_SQUARE_BRACKET
- || lexemes.lookAhead() == Lexer.CHAR_LEFT_SQUARE_BRACKET
- || buffer < 0) {
- // treat hypen as normal character
- if (buffer >= 0)
- res.add(buffer);
- buffer = '-';
- lexemes.next();
- // range
- } else {
- lexemes.next();
- int cur = lexemes.peek();
-
- if (!lexemes.isSpecial()
- && (cur >= 0
- || lexemes.lookAhead() == Lexer.CHAR_RIGHT_SQUARE_BRACKET
- || lexemes.lookAhead() == Lexer.CHAR_LEFT_SQUARE_BRACKET || buffer < 0)) {
-
- try {
- res.add(buffer, (char) lexemes.peek());
- } catch (Exception e) {
- throw new PatternSyntaxException(I18n
- .getMessage("Illegal character range"),
- pattern(), lexemes.getIndex());
- }
- lexemes.next();
- buffer = -1;
- } else {
- throw new PatternSyntaxException(I18n
- .getMessage("Illegal character range"),
- pattern(), lexemes.getIndex());
- }
- }
-
- break;
- }
-
- case 0: {
- if (buffer >= 0)
- res.add(buffer);
- AbstractCharClass cs = (AbstractCharClass) lexemes
- .peekSpecial();
- if (cs != null) {
- res.add(cs);
- buffer = -1;
- } else {
- buffer = 0;
- }
-
- lexemes.next();
- break;
- }
-
- default: {
- if (buffer >= 0)
- res.add(buffer);
- buffer = lexemes.next();
- break;
- }
- }
-
- firstInClass = false;
- }
- if (notClosed) {
- throw new PatternSyntaxException(I18n.getMessage("Missing ']'"),
- pattern(), lexemes.getIndex() - 1);
- }
- if (buffer >= 0)
- res.add(buffer);
- return res;
- }
-
- /**
- * @com.intel.drl.spec_ref
- */
- public static Pattern compile(String pattern) {
- return compile(pattern, 0);
- }
-
- /**
- * @com.intel.drl.spec_ref
- */
- public static boolean matches(String regex, CharSequence input) {
- return Pattern.compile(regex).matcher(input).matches();
- }
-
- public static String quote(String s) {
- StringBuffer sb = new StringBuffer().append("\\Q");
- int apos = 0;
- int k;
- while ((k = s.indexOf("\\E", apos)) >= 0) {
- sb.append(s.substring(apos, k + 2)).append("\\\\E\\Q");
- apos = k + 2;
- }
-
- return sb.append(s.substring(apos)).append("\\E").toString();
- }
-
- /**
- * return number of groups found at compile time
- */
- int groupCount() {
- return globalGroupIndex;
- }
-
- int compCount() {
- return this.compCount + 1;
- }
-
- int consCount() {
- return this.consCount + 1;
- }
-
- /**
- * Returns supplementary character. At this time only for ASCII chars.
- */
- static char getSupplement(char ch) {
- char res = ch;
- if (ch >= 'a' && ch <= 'z') {
- res -= 32;
- } else if (ch >= 'A' && ch <= 'Z') {
- res += 32;
- }
-
- return res;
- }
-
- /**
- * @return true if pattern has specified flag
- */
- private boolean hasFlag(int flag) {
- return (flags & flag) == flag;
- }
-
- /**
- * Dismiss public costructor.
- *
- */
- private Pattern() {
- }
-
- /**
- * Serialization support
- */
- private void readObject(java.io.ObjectInputStream s)
- throws java.io.IOException, ClassNotFoundException {
- s.defaultReadObject();
- AbstractSet.counter = 1;
- groupIndex = -1;
- globalGroupIndex = -1;
- compCount = -1;
- consCount = -1;
-
- compileImpl(pattern, flags);
-
- }
-}
+/*
+ * Copyright 2005 The Apache Software Foundation or its licensors, as applicable.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @author Nikolay A. Kuznetsov
+ * @version $Revision: 1.36.2.2 $
+ */
+package java.util.regex;
+
+import java.io.Serializable;
+
+import java.util.ArrayList;
+
+/**
+ * @com.intel.drl.spec_ref
+ *
+ * @author Nikolay A. Kuznetsov
+ * @version $Revision: 1.36.2.2 $
+ */
+public final class Pattern implements Serializable {
+
+ private static final long serialVersionUID = 5073258162644648461L;
+
+ static final boolean _DEBUG_ = false;
+
+ /**
+ * @com.intel.drl.spec_ref
+ */
+ public static final int UNIX_LINES = 1 << 0;
+
+ /**
+ * @com.intel.drl.spec_ref
+ */
+ public static final int CASE_INSENSITIVE = 1 << 1;
+
+ /**
+ * @com.intel.drl.spec_ref
+ */
+ public static final int COMMENTS = 1 << 2;
+
+ /**
+ * @com.intel.drl.spec_ref
+ */
+ public static final int MULTILINE = 1 << 3;
+
+ /**
+ * @com.intel.drl.spec_ref
+ */
+ public static final int LITERAL = 1 << 4;
+
+ /**
+ * @com.intel.drl.spec_ref
+ */
+ public static final int DOTALL = 1 << 5;
+
+ /**
+ * @com.intel.drl.spec_ref
+ */
+ public static final int UNICODE_CASE = 1 << 6;
+
+ /**
+ * @com.intel.drl.spec_ref
+ */
+ public static final int CANON_EQ = 1 << 7;
+
+ /**
+ * Current <code>pattern</code> to be compiled;
+ */
+ private Lexer lexemes = null;
+
+ /**
+ * Pattern compile flags;
+ */
+ private int flags = 0;
+
+ private String pattern = null;
+
+ transient private int groupIndex = -1;
+
+ transient private int globalGroupIndex = -1;
+
+ transient private int compCount = -1;
+
+ transient private int consCount = -1;
+
+ transient AbstractSet start = null;
+
+ /**
+ * @com.intel.drl.spec_ref
+ */
+ public Matcher matcher(CharSequence cs) {
+ return new Matcher(this, cs);
+ }
+
+ /**
+ * @com.intel.drl.spec_ref
+ */
+ public String[] split(CharSequence input, int limit) {
+ ArrayList res = new ArrayList();
+ Matcher mat = matcher(input);
+ int index = 0;
+ int curPos = 0;
+
+ if (input.length() == 0) {
+ return new String [] {""};
+ } else {
+ while (mat.find() && (index + 1 < limit || limit <= 0)) {
+ res.add(input.subSequence(curPos, mat.start()).toString());
+ curPos = mat.end();
+ index++;
+ }
+
+ res.add(input.subSequence(curPos, input.length()).toString());
+ index++;
+
+ /*
+ * discard trailing empty strings
+ */
+ if (limit == 0) {
+ while (--index >= 0 && res.get(index).toString().length() == 0) {
+ res.remove(index);
+ }
+ }
+ }
+ return (String[]) res.toArray(new String[index >= 0 ? index : 0]);
+ }
+
+ /**
+ * @com.intel.drl.spec_ref
+ */
+ public String[] split(CharSequence input) {
+ return split(input, 0);
+ }
+
+ /**
+ * @com.intel.drl.spec_ref
+ */
+ public String pattern() {
+ return lexemes.toString();
+ }
+
+ /**
+ * @com.intel.drl.spec_ref
+ */
+ public String toString() {
+ return this.pattern();
+ }
+
+ /**
+ * @com.intel.drl.spec_ref
+ */
+ public int flags() {
+ return this.flags;
+ }
+
+ /**
+ * @com.intel.drl.spec_ref
+ */
+ public static Pattern compile(String regex, int flags)
+ throws PatternSyntaxException {
+ AbstractSet.counter = 1;
+
+ return new Pattern().compileImpl(regex, flags);
+ }
+
+ /**
+ *
+ * @param pattern -
+ * Regular expression to be compiled
+ * @param flags -
+ * The bit mask including CASE_INSENSITIVE, MULTILINE, DOTALL,
+ * UNICODE_CASE, and CANON_EQ
+ *
+ * @return Compiled pattern
+ */
+ private Pattern compileImpl(String regex, int flags)
+ throws PatternSyntaxException {
+ this.lexemes = new Lexer(regex, flags);
+ this.flags = flags;
+
+ start = processExpression(-1, flags, null);
+ if (!lexemes.isEmpty()) {
+ throw new PatternSyntaxException(I18n
+ .getMessage("Trailing characters"), lexemes.toString(),
+ lexemes.getIndex());
+ }
+
+ return this;
+ }
+
+ /**
+ * A->(a|)+
+ */
+ private AbstractSet processAlternations(AbstractSet last) {
+ CharClass auxRange = new CharClass(hasFlag(Pattern.CASE_INSENSITIVE),
+ hasFlag(Pattern.UNICODE_CASE));
+ while (!lexemes.isEmpty()
+ && lexemes.isLetter()
+ && (lexemes.lookAhead() == 0
+ || lexemes.lookAhead() == Lexer.CHAR_VERTICAL_BAR || lexemes
+ .lookAhead() == Lexer.CHAR_RIGHT_PARENTHESIS)) {
+ auxRange.add(lexemes.next());
+ if (lexemes.peek() == Lexer.CHAR_VERTICAL_BAR)
+ lexemes.next();
+ }
+
+ if (!auxRange.hasUCI()) {
+ return new RangeSet(auxRange, last);
+ } else {
+ return new UCIRangeSet(auxRange, last);
+ }
+ }
+
+ /**
+ * E->AE; E->S|E; E->S; A->(a|)+ E->S(|S)*
+ */
+ private AbstractSet processExpression(int ch, int new_flags,
+ AbstractSet last) {
+ ArrayList children = new ArrayList();
+ AbstractSet child;
+ int safe_flags = flags;
+ FSet fSet;
+
+ if (new_flags != flags) {
+ flags = new_flags;
+ lexemes.setFlags(flags);
+ }
+
+ switch (ch) {
+ case Lexer.CHAR_NONCAP_GROUP: {
+ fSet = new NonCapFSet(++consCount);
+ break;
+ }
+
+ case Lexer.CHAR_POS_LOOKAHEAD:
+ case Lexer.CHAR_NEG_LOOKAHEAD: {
+ fSet = new AheadFSet();
+ break;
+ }
+ case Lexer.CHAR_POS_LOOKBEHIND:
+ case Lexer.CHAR_NEG_LOOKBEHIND: {
+ fSet = new BehindFSet(++consCount);
+ break;
+ }
+
+ case Lexer.CHAR_ATOMIC_GROUP: {
+ fSet = new AtomicFSet(++consCount);
+ break;
+ }
+
+ default: {
+ globalGroupIndex++;
+ if (last == null) {
+ fSet = new FinalSet();
+ // expr = new StartSet();
+ } else {
+ fSet = new FSet(globalGroupIndex);
+ // expr = new JointSet(globalGroupIndex);
+ }
+ }
+ }
+
+ do {
+ if (lexemes.isLetter()
+ && lexemes.lookAhead() == Lexer.CHAR_VERTICAL_BAR) {
+ child = processAlternations(fSet);
+ } else {
+ child = processSubExpression(fSet);
+ if (lexemes.peek() == Lexer.CHAR_VERTICAL_BAR)
+ lexemes.next();
+ }
+ if (child != null)
+ children.add(child);
+ // expr.addChild(child);
+ } while (!(lexemes.isEmpty() || lexemes.peek() == Lexer.CHAR_RIGHT_PARENTHESIS));
+
+ if (flags != safe_flags) {
+ flags = safe_flags;
+ lexemes.setFlags(flags);
+ }
+
+ switch (ch) {
+ case Lexer.CHAR_NONCAP_GROUP: {
+ return new NonCapJointSet(children, fSet);
+ }
+ case Lexer.CHAR_POS_LOOKAHEAD: {
+ return new PositiveLookAhead(children, fSet);
+ }
+
+ case Lexer.CHAR_NEG_LOOKAHEAD: {
+ return new NegativeLookAhead(children, fSet);
+ }
+
+ case Lexer.CHAR_POS_LOOKBEHIND: {
+ return new PositiveLookBehind(children, fSet);
+ }
+
+ case Lexer.CHAR_NEG_LOOKBEHIND: {
+ return new NegativeLookBehind(children, fSet);
+ }
+
+ case Lexer.CHAR_ATOMIC_GROUP: {
+ return new AtomicJointSet(children, fSet);
+ }
+ default: {
+ switch (children.size()) {
+ case 0:
+ return new EmptySet(fSet);
+ case 1:
+ return new SingleSet((AbstractSet) children.get(0), fSet);
+ default:
+ return new JointSet(children, fSet);
+ }
+ }
+ }
+ }
+
+ /**
+ * T->a+
+ */
+ private AbstractSet processSequence(AbstractSet last) {
+ StringBuffer substring = new StringBuffer();
+ while (!lexemes.isEmpty()
+ && lexemes.isLetter()
+ && ((!lexemes.isNextSpecial() && lexemes.lookAhead() == 0) // end
+ // of
+ // pattern
+ || (!lexemes.isNextSpecial() && Lexer.isLetter(lexemes
+ .lookAhead()))
+ || lexemes.lookAhead() == Lexer.CHAR_RIGHT_PARENTHESIS
+ || (lexemes.lookAhead() & 0x8000ffff) == Lexer.CHAR_LEFT_PARENTHESIS
+ || lexemes.lookAhead() == Lexer.CHAR_VERTICAL_BAR || lexemes
+ .lookAhead() == Lexer.CHAR_DOLLAR)) {
+ substring.append((char) lexemes.next());
+ }
+ if (!hasFlag(Pattern.CASE_INSENSITIVE)) {
+ return new SequenceSet(substring);
+ } else if (!hasFlag(Pattern.UNICODE_CASE)) {
+ return new CISequenceSet(substring);
+ } else {
+ return new UCISequenceSet(substring);
+ }
+ }
+
+ /**
+ * S->BS; S->QS; S->Q; B->a+
+ */
+ private AbstractSet processSubExpression(AbstractSet last) {
+ AbstractSet cur;
+ if (lexemes.isLetter() && !lexemes.isNextSpecial()
+ && Lexer.isLetter(lexemes.lookAhead())) {
+ cur = processSequence(last);
+ } else {
+ cur = processQuantifier(last);
+ }
+
+ if (!lexemes.isEmpty()
+ // && !pattern.isQuantifier()
+ && lexemes.peek() != Lexer.CHAR_RIGHT_PARENTHESIS
+ && lexemes.peek() != Lexer.CHAR_VERTICAL_BAR) {
+ AbstractSet next = processSubExpression(last);
+ if (cur instanceof LeafQuantifierSet
+ // TODO create personal UnifiedQuantifierSet for composite
+ // quantifiers
+ // to take into account Quantifier counters
+ // ////
+ && !(cur instanceof CompositeQuantifierSet)
+ && !(cur instanceof GroupQuantifierSet)
+ && !next.first(((LeafQuantifierSet) cur).getInnerSet())) {
+ cur = new UnifiedQuantifierSet((LeafQuantifierSet) cur);
+ }
+ if (((char) next.getType()) == '+') {
+ cur.setNext(((LeafQuantifierSet) next).getInnerSet());
+ } else {
+ cur.setNext(next);
+ }
+ } else if (cur != null) {
+ cur.setNext(last);
+ } else {
+ return null;
+ }
+
+ if (((char) cur.getType()) == '+') {
+ return ((QuantifierSet) cur).getInnerSet();
+ } else {
+ return cur;
+ }
+ }
+
+ /**
+ * Q->T(*|+|?...) also do some optimizations.
+ *
+ */
+ private AbstractSet processQuantifier(AbstractSet last) {
+ AbstractSet term = processTerminal(last);
+ SpecialToken quantifier = null;
+
+ int quant = lexemes.peek();
+
+ if (term != null && !(term instanceof LeafSet)) {
+ switch (quant) {
+ case Lexer.QUANT_STAR:
+ case Lexer.QUANT_PLUS: {
+ lexemes.next();
+ GroupQuantifierSet q = new GroupQuantifierSet(term, last, quant);
+ term.setNext(q);
+ return q;
+ }
+
+ case Lexer.QUANT_STAR_R:
+ case Lexer.QUANT_PLUS_R: {
+ lexemes.next();
+ GroupQuantifierSet q = new ReluctantGroupQuantifierSet(term,
+ last, quant);
+ term.setNext(q);
+ return q;
+ }
+
+ case Lexer.QUANT_PLUS_P: {
+ lexemes.next();
+ // possessive plus will be handled by unique class
+ // and should not be postprocessed to point previous set
+ // to the inner one.
+ // //
+ return new PosPlusGroupQuantifierSet(term, last,
+ Lexer.QUANT_STAR_P);
+ }
+
+ case Lexer.QUANT_STAR_P: {
+ lexemes.next();
+ return new PossessiveGroupQuantifierSet(term, last, quant);
+ }
+
+ case Lexer.QUANT_ALT: {
+ lexemes.next();
+ AltGroupQuantifierSet q = new AltGroupQuantifierSet(term, last,
+ Lexer.QUANT_ALT);
+ term.setNext(last);
+ return q;
+ }
+
+ case Lexer.QUANT_ALT_P: {
+ lexemes.next();
+ return new PosAltGroupQuantifierSet(term, last, Lexer.QUANT_ALT);
+ }
+
+ case Lexer.QUANT_ALT_R: {
+ lexemes.next();
+ RelAltGroupQuantifierSet q = new RelAltGroupQuantifierSet(term,
+ last, Lexer.QUANT_ALT);
+ term.setNext(last);
+ return q;
+ }
+
+ case Lexer.QUANT_COMP: {
+ CompositeGroupQuantifierSet q = new CompositeGroupQuantifierSet(
+ (Quantifier) lexemes.nextSpecial(), term, last,
+ Lexer.QUANT_ALT, ++compCount);
+ term.setNext(q);
+ return q;
+ }
+
+ case Lexer.QUANT_COMP_P: {
+ return new PosCompositeGroupQuantifierSet((Quantifier) lexemes
+ .nextSpecial(), term, last, Lexer.QUANT_ALT,
+ ++compCount);
+ }
+
+ case Lexer.QUANT_COMP_R: {
+ RelCompositeGroupQuantifierSet q = new RelCompositeGroupQuantifierSet(
+ (Quantifier) lexemes.nextSpecial(), term, last,
+ Lexer.QUANT_ALT, ++compCount);
+ term.setNext(q);
+ return q;
+ }
+
+ default:
+ return term;
+ }
+ } else {
+ LeafSet leaf = null;
+ if (term != null)
+ leaf = (LeafSet) term;
+ switch (quant) {
+ case Lexer.QUANT_STAR:
+ case Lexer.QUANT_PLUS: {
+ lexemes.next();
+ LeafQuantifierSet q;
+ if (term.getType() == AbstractSet.TYPE_DOTSET) {
+ if (!hasFlag(Pattern.DOTALL)) {
+ q = new DotQuantifierSet(leaf, last, quant,
+ AbstractLineTerminator.getInstance(flags));
+ } else {
+ q = new DotAllQuantifierSet(leaf, last, quant);
+ }
+ } else {
+ q = new LeafQuantifierSet(leaf, last, quant);
+ }
+ leaf.setNext(q);
+ return q;
+ }
+
+ case Lexer.QUANT_STAR_R:
+ case Lexer.QUANT_PLUS_R: {
+ lexemes.next();
+ ReluctantQuantifierSet q = new ReluctantQuantifierSet(leaf,
+ last, quant);
+ leaf.setNext(q);
+ return q;
+ }
+
+ case Lexer.QUANT_PLUS_P:
+ case Lexer.QUANT_STAR_P: {
+ lexemes.next();
+ PossessiveQuantifierSet q = new PossessiveQuantifierSet(leaf,
+ last, quant);
+ leaf.setNext(q);
+ return q;
+ }
+
+ case Lexer.QUANT_ALT: {
+ lexemes.next();
+ return new AltQuantifierSet(leaf, last, Lexer.QUANT_ALT);
+ }
+
+ case Lexer.QUANT_ALT_P: {
+ lexemes.next();
+ return new PossessiveAltQuantifierSet(leaf, last,
+ Lexer.QUANT_ALT_P);
+ }
+
+ case Lexer.QUANT_ALT_R: {
+ lexemes.next();
+ return new ReluctantAltQuantifierSet(leaf, last,
+ Lexer.QUANT_ALT_R);
+ }
+
+ case Lexer.QUANT_COMP: {
+ return new CompositeQuantifierSet((Quantifier) lexemes
+ .nextSpecial(), leaf, last, Lexer.QUANT_COMP);
+ }
+
+ case Lexer.QUANT_COMP_P: {
+ return new PossessiveCompositeQuantifierSet(
+ (Quantifier) lexemes.nextSpecial(), leaf, last,
+ Lexer.QUANT_COMP_P);
+ }
+ case Lexer.QUANT_COMP_R: {
+ return new ReluctantCompositeQuantifierSet((Quantifier) lexemes
+ .nextSpecial(), leaf, last, Lexer.QUANT_COMP_R);
+ }
+
+ default:
+ return term;
+ }
+ }
+ }
+
+ /**
+ * T-> letter|[range]|{char-class}|(E)
+ */
+ private AbstractSet processTerminal(AbstractSet last) {
+ int ch;
+ AbstractSet term = null;
+ do {
+ ch = lexemes.peek();
+ if ((ch & 0x8000ffff) == Lexer.CHAR_LEFT_PARENTHESIS) {
+ lexemes.next();
+ int new_flags = (ch & 0x00ff0000) >> 16;
+ ch = ch & 0xff00ffff;
+
+ if (ch == Lexer.CHAR_FLAGS) {
+ flags = new_flags;
+ lexemes.setFlags(new_flags);
+ } else {
+ new_flags = (ch == Lexer.CHAR_NONCAP_GROUP) ? new_flags
+ : flags;
+ term = processExpression(ch, new_flags, last);
+ if (lexemes.peek() != Lexer.CHAR_RIGHT_PARENTHESIS)
+ throw new PatternSyntaxException(I18n
+ .getMessage("unmatched ("), lexemes.toString(),
+ lexemes.getIndex());
+ lexemes.next();
+ }
+
+ } else
+ switch (ch) {
+ case Lexer.CHAR_LEFT_SQUARE_BRACKET: {
+ lexemes.next();
+ boolean negative = false;
+ if (lexemes.peek() == Lexer.CHAR_CARET) {
+ negative = true;
+ lexemes.next();
+ }
+
+ term = processRange(negative, last);
+ if (lexemes.peek() != Lexer.CHAR_RIGHT_SQUARE_BRACKET)
+ throw new PatternSyntaxException(I18n
+ .getMessage("unmatched ["), lexemes.toString(),
+ lexemes.getIndex());
+ lexemes.setMode(Lexer.MODE_PATTERN);
+ lexemes.next();
+ break;
+ }
+
+ case Lexer.CHAR_DOT: {
+ lexemes.next();
+
+ if (!hasFlag(Pattern.DOTALL)) {
+ term = new DotSet(AbstractLineTerminator
+ .getInstance(flags));
+ } else {
+ term = new DotAllSet();
+ }
+
+ break;
+ }
+
+ case Lexer.CHAR_CARET: {
+ lexemes.next();
+ consCount++;
+ if (!hasFlag(Pattern.MULTILINE)) {
+ term = new SOLSet();
+ } else {
+ term = new MultiLineSOLSet(AbstractLineTerminator
+ .getInstance(flags));
+ }
+
+ break;
+ }
+
+ case Lexer.CHAR_DOLLAR: {
+ lexemes.next();
+ consCount++;
+ if (!hasFlag(Pattern.MULTILINE)) {
+ if (!hasFlag(Pattern.UNIX_LINES)) {
+ term = new EOLSet(consCount);
+ } else {
+ term = new UEOLSet(consCount);
+ }
+ } else {
+ if (!hasFlag(Pattern.UNIX_LINES)) {
+ term = new MultiLineEOLSet(consCount);
+ } else {
+ term = new UMultiLineEOLSet(consCount);
+ }
+ }
+
+ break;
+ }
+
+ case Lexer.CHAR_WORD_BOUND: {
+ lexemes.next();
+ term = new WordBoundary(true);
+ break;
+ }
+
+ case Lexer.CHAR_NONWORD_BOUND: {
+ lexemes.next();
+ term = new WordBoundary(false);
+ break;
+ }
+
+ case Lexer.CHAR_END_OF_INPUT: {
+ lexemes.next();
+ term = new EOISet();
+ break;
+ }
+
+ case Lexer.CHAR_END_OF_LINE: {
+ lexemes.next();
+ term = new EOLSet(++consCount);
+ break;
+ }
+
+ case Lexer.CHAR_START_OF_INPUT: {
+ lexemes.next();
+ term = new SOLSet();
+ break;
+ }
+
+ case Lexer.CHAR_PREVIOUS_MATCH: {
+ lexemes.next();
+ term = new PreviousMatch();
+ break;
+ }
+
+ case 0x80000000 | '1':
+ case 0x80000000 | '2':
+ case 0x80000000 | '3':
+ case 0x80000000 | '4':
+ case 0x80000000 | '5':
+ case 0x80000000 | '6':
+ case 0x80000000 | '7':
+ case 0x80000000 | '8':
+ case 0x80000000 | '9': {
+ int number = (ch & 0x7FFFFFFF) - '0';
+ if (globalGroupIndex >= number) {
+ lexemes.next();
+ consCount++;
+ if (!hasFlag(Pattern.CASE_INSENSITIVE)) {
+ term = new BackReferenceSet(number, consCount);
+ } else if (!hasFlag(Pattern.UNICODE_CASE)) {
+ term = new CIBackReferenceSet(number, consCount);
+ } else {
+ term = new UCIBackReferenceSet(number, consCount);
+ }
+ break;
+ } else {
+ throw new PatternSyntaxException(I18n
+ .getMessage("No such group yet exists at "
+ + "this point in the pattern"), lexemes
+ .toString(), lexemes.getIndex());
+ }
+ }
+
+ case 0: {
+ AbstractCharClass cc = null;
+ if ((cc = (AbstractCharClass) lexemes.peekSpecial()) != null) {
+ term = new RangeSet(cc);
+ } else if (!lexemes.isEmpty()) {
+ term = new CharSet((char) ch);
+ }
+ lexemes.next();
+ break;
+ }
+
+ default: {
+ if (ch >= 0 && !lexemes.isSpecial()) {
+ if (hasFlag(Pattern.CASE_INSENSITIVE)) {
+ if ((ch >= 'a' && ch <= 'z')
+ || (ch >= 'A' && ch <= 'Z')) {
+ term = new CICharSet((char) ch);
+ } else if (hasFlag(Pattern.UNICODE_CASE)
+ && ch > 128) {
+ term = new UCICharSet((char) ch);
+ } else {
+ term = new CharSet((char) ch);
+ }
+ } else {
+ term = new CharSet((char) ch);
+ }
+ lexemes.next();
+ } else {
+ throw new PatternSyntaxException(I18n
+ .getMessage("Dangling meta construction")
+ + ": "
+ + (lexemes.isSpecial() ? lexemes.peekSpecial()
+ .toString() : Character
+ .toString((char) ch)), lexemes
+ .toString(), lexemes.getIndex());
+ }
+ }
+ }
+ } while (ch == Lexer.CHAR_FLAGS);
+ return term;
+ }
+
+ private AbstractSet processRange(boolean negative, AbstractSet last) {
+ AbstractCharClass res = processRangeExpression(negative);
+ if (!res.hasUCI()) {
+ return new RangeSet(res, last);
+ } else {
+ return new UCIRangeSet(res, last);
+ }
+ }
+
+ /**
+ * proceess [...] ranges
+ */
+ private AbstractCharClass processRangeExpression(boolean alt) {
+ CharClass res = new CharClass(alt, hasFlag(Pattern.CASE_INSENSITIVE),
+ hasFlag(Pattern.UNICODE_CASE));
+ int buffer = -1;
+ // TODO remove this one, being used for debug only
+ int ch = 0;
+ boolean intersection = false;
+ boolean notClosed = false;
+ boolean firstInClass = true;
+
+ while (!lexemes.isEmpty()
+ && (notClosed = (ch = lexemes.peek()) != Lexer.CHAR_RIGHT_SQUARE_BRACKET
+ || firstInClass)) {
+ switch (lexemes.peek()) {
+
+ case Lexer.CHAR_RIGHT_SQUARE_BRACKET: {
+ if (buffer >= 0)
+ res.add(buffer);
+ buffer = ']';
+ lexemes.next();
+ break;
+ }
+ case Lexer.CHAR_LEFT_SQUARE_BRACKET: {
+ lexemes.next();
+ boolean negative = false;
+ if (lexemes.peek() == Lexer.CHAR_CARET) {
+ lexemes.next();
+ negative = true;
+ }
+
+ if (intersection)
+ res.intersection(processRangeExpression(negative));
+ else
+ res.union(processRangeExpression(negative));
+ intersection = false;
+ lexemes.next();
+ break;
+ }
+
+ case Lexer.CHAR_AMPERSAND: {
+ if (buffer >= 0)
+ res.add(buffer);
+ buffer = lexemes.next();
+ // if there is a start for subrange we will do an intersection
+ // otherwise treat '&' as normal character
+ if (lexemes.peek() == Lexer.CHAR_AMPERSAND
+ && lexemes.lookAhead() == Lexer.CHAR_LEFT_SQUARE_BRACKET) {
+ lexemes.next();
+ intersection = true;
+ buffer = -1;
+ }
+
+ break;
+ }
+
+ case Lexer.CHAR_HYPHEN: {
+ if (firstInClass
+ || lexemes.lookAhead() == Lexer.CHAR_RIGHT_SQUARE_BRACKET
+ || lexemes.lookAhead() == Lexer.CHAR_LEFT_SQUARE_BRACKET
+ || buffer < 0) {
+ // treat hypen as normal character
+ if (buffer >= 0)
+ res.add(buffer);
+ buffer = '-';
+ lexemes.next();
+ // range
+ } else {
+ lexemes.next();
+ int cur = lexemes.peek();
+
+ if (!lexemes.isSpecial()
+ && (cur >= 0
+ || lexemes.lookAhead() == Lexer.CHAR_RIGHT_SQUARE_BRACKET
+ || lexemes.lookAhead() == Lexer.CHAR_LEFT_SQUARE_BRACKET || buffer < 0)) {
+
+ try {
+ res.add(buffer, (char) lexemes.peek());
+ } catch (Exception e) {
+ throw new PatternSyntaxException(I18n
+ .getMessage("Illegal character range"),
+ pattern(), lexemes.getIndex());
+ }
+ lexemes.next();
+ buffer = -1;
+ } else {
+ throw new PatternSyntaxException(I18n
+ .getMessage("Illegal character range"),
+ pattern(), lexemes.getIndex());
+ }
+ }
+
+ break;
+ }
+
+ case 0: {
+ if (buffer >= 0)
+ res.add(buffer);
+ AbstractCharClass cs = (AbstractCharClass) lexemes
+ .peekSpecial();
+ if (cs != null) {
+ res.add(cs);
+ buffer = -1;
+ } else {
+ buffer = 0;
+ }
+
+ lexemes.next();
+ break;
+ }
+
+ default: {
+ if (buffer >= 0)
+ res.add(buffer);
+ buffer = lexemes.next();
+ break;
+ }
+ }
+
+ firstInClass = false;
+ }
+ if (notClosed) {
+ throw new PatternSyntaxException(I18n.getMessage("Missing ']'"),
+ pattern(), lexemes.getIndex() - 1);
+ }
+ if (buffer >= 0)
+ res.add(buffer);
+ return res;
+ }
+
+ /**
+ * @com.intel.drl.spec_ref
+ */
+ public static Pattern compile(String pattern) {
+ return compile(pattern, 0);
+ }
+
+ /**
+ * @com.intel.drl.spec_ref
+ */
+ public static boolean matches(String regex, CharSequence input) {
+ return Pattern.compile(regex).matcher(input).matches();
+ }
+
+ public static String quote(String s) {
+ StringBuffer sb = new StringBuffer().append("\\Q");
+ int apos = 0;
+ int k;
+ while ((k = s.indexOf("\\E", apos)) >= 0) {
+ sb.append(s.substring(apos, k + 2)).append("\\\\E\\Q");
+ apos = k + 2;
+ }
+
+ return sb.append(s.substring(apos)).append("\\E").toString();
+ }
+
+ /**
+ * return number of groups found at compile time
+ */
+ int groupCount() {
+ return globalGroupIndex;
+ }
+
+ int compCount() {
+ return this.compCount + 1;
+ }
+
+ int consCount() {
+ return this.consCount + 1;
+ }
+
+ /**
+ * Returns supplementary character. At this time only for ASCII chars.
+ */
+ static char getSupplement(char ch) {
+ char res = ch;
+ if (ch >= 'a' && ch <= 'z') {
+ res -= 32;
+ } else if (ch >= 'A' && ch <= 'Z') {
+ res += 32;
+ }
+
+ return res;
+ }
+
+ /**
+ * @return true if pattern has specified flag
+ */
+ private boolean hasFlag(int flag) {
+ return (flags & flag) == flag;
+ }
+
+ /**
+ * Dismiss public costructor.
+ *
+ */
+ private Pattern() {
+ }
+
+ /**
+ * Serialization support
+ */
+ private void readObject(java.io.ObjectInputStream s)
+ throws java.io.IOException, ClassNotFoundException {
+ s.defaultReadObject();
+ AbstractSet.counter = 1;
+ groupIndex = -1;
+ globalGroupIndex = -1;
+ compCount = -1;
+ consCount = -1;
+
+ compileImpl(pattern, flags);
+
+ }
+}