You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@xalan.apache.org by mu...@apache.org on 2023/04/20 15:44:02 UTC
[xalan-java] branch xalan-j_xslt3.0 updated: committing implementations of xpath 3.1 functions matches and replace
This is an automated email from the ASF dual-hosted git repository.
mukulg pushed a commit to branch xalan-j_xslt3.0
in repository https://gitbox.apache.org/repos/asf/xalan-java.git
The following commit(s) were added to refs/heads/xalan-j_xslt3.0 by this push:
new e453a835 committing implementations of xpath 3.1 functions matches and replace
e453a835 is described below
commit e453a835a8b0cd983df66276bb7a573a457a76ec
Author: Mukul Gandhi <ga...@gmail.com>
AuthorDate: Thu Apr 20 21:13:37 2023 +0530
committing implementations of xpath 3.1 functions matches and replace
---
src/org/apache/xpath/compiler/FunctionTable.java | 21 +-
src/org/apache/xpath/compiler/Keywords.java | 6 +
src/org/apache/xpath/functions/FuncMatches.java | 114 +
src/org/apache/xpath/functions/FuncReplace.java | 115 +
src/org/apache/xpath/functions/Function2Args.java | 5 +-
src/org/apache/xpath/functions/Function3Args.java | 4 +-
.../{Function3Args.java => Function4Args.java} | 73 +-
src/org/apache/xpath/functions/FunctionOneArg.java | 3 +-
.../xpath/functions/RegExFunctionSupport.java | 104 +
src/org/apache/xpath/regex/ASCII.java | 274 +
src/org/apache/xpath/regex/MatchResult.java | 188 +
src/org/apache/xpath/regex/Matcher.java | 1319 +++++
src/org/apache/xpath/regex/Pattern.java | 5250 ++++++++++++++++++++
.../apache/xpath/regex/PatternSyntaxException.java | 124 +
src/org/apache/xpath/regex/UnicodeProp.java | 246 +
src/org/apache/xpath/res/XPATHErrorResources.java | 15 +
.../apache/xpath/res/XPATHErrorResources_sv.java | 5 +-
17 files changed, 7818 insertions(+), 48 deletions(-)
diff --git a/src/org/apache/xpath/compiler/FunctionTable.java b/src/org/apache/xpath/compiler/FunctionTable.java
index 5619882f..2c3746a4 100644
--- a/src/org/apache/xpath/compiler/FunctionTable.java
+++ b/src/org/apache/xpath/compiler/FunctionTable.java
@@ -20,10 +20,9 @@
*/
package org.apache.xpath.compiler;
-import org.apache.xpath.Expression;
-import org.apache.xpath.functions.Function;
import java.util.HashMap;
import javax.xml.transform.TransformerException;
+import org.apache.xpath.functions.Function;
/**
* The function table for XPath.
@@ -132,6 +131,12 @@ public class FunctionTable
/** The 'unparsed-entity-uri()' id (XSLT). */
public static final int FUNC_UNPARSED_ENTITY_URI = 36;
+
+ /** The 'matches()' id. */
+ public static final int FUNC_MATCHES = 37;
+
+ /** The 'replace()' id. */
+ public static final int FUNC_REPLACE = 38;
// Proprietary
@@ -157,10 +162,10 @@ public class FunctionTable
private HashMap m_functionID_customer = new HashMap();
/**
- * Number of built in functions. Be sure to update this as
+ * Number of built in functions. Be sure to update this as
* built-in functions are added.
*/
- private static final int NUM_BUILT_IN_FUNCS = 37;
+ private static final int NUM_BUILT_IN_FUNCS = 39;
/**
* Number of built-in functions that may be added.
@@ -226,6 +231,10 @@ public class FunctionTable
org.apache.xpath.functions.FuncDoclocation.class;
m_functions[FUNC_UNPARSED_ENTITY_URI] =
org.apache.xpath.functions.FuncUnparsedEntityURI.class;
+ m_functions[FUNC_MATCHES] =
+ org.apache.xpath.functions.FuncMatches.class;
+ m_functions[FUNC_REPLACE] =
+ org.apache.xpath.functions.FuncReplace.class;
}
static{
@@ -297,6 +306,10 @@ public class FunctionTable
new Integer(FunctionTable.FUNC_STRING_LENGTH));
m_functionID.put(Keywords.FUNC_UNPARSED_ENTITY_URI_STRING,
new Integer(FunctionTable.FUNC_UNPARSED_ENTITY_URI));
+ m_functionID.put(Keywords.FUNC_MATCHES_STRING,
+ new Integer(FunctionTable.FUNC_MATCHES));
+ m_functionID.put(Keywords.FUNC_REPLACE_STRING,
+ new Integer(FunctionTable.FUNC_REPLACE));
m_functionID.put(Keywords.FUNC_DOCLOCATION_STRING,
new Integer(FunctionTable.FUNC_DOCLOCATION));
}
diff --git a/src/org/apache/xpath/compiler/Keywords.java b/src/org/apache/xpath/compiler/Keywords.java
index 507e62b9..84a159ff 100644
--- a/src/org/apache/xpath/compiler/Keywords.java
+++ b/src/org/apache/xpath/compiler/Keywords.java
@@ -207,6 +207,12 @@ public class Keywords
/** unparsed-entity-uri function string (XSLT). */
public static final String FUNC_UNPARSED_ENTITY_URI_STRING =
"unparsed-entity-uri";
+
+ /** matches function string. */
+ public static final String FUNC_MATCHES_STRING = "matches";
+
+ /** replace function string. */
+ public static final String FUNC_REPLACE_STRING = "replace";
// Proprietary, built in functions
diff --git a/src/org/apache/xpath/functions/FuncMatches.java b/src/org/apache/xpath/functions/FuncMatches.java
new file mode 100644
index 00000000..f38d5a2b
--- /dev/null
+++ b/src/org/apache/xpath/functions/FuncMatches.java
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * $Id$
+ */
+package org.apache.xpath.functions;
+
+import javax.xml.transform.SourceLocator;
+
+import org.apache.xalan.res.XSLMessages;
+import org.apache.xml.utils.XMLString;
+import org.apache.xpath.XPathContext;
+import org.apache.xpath.objects.XBoolean;
+import org.apache.xpath.objects.XObject;
+import org.apache.xpath.regex.Matcher;
+import org.apache.xpath.regex.PatternSyntaxException;
+import org.apache.xpath.res.XPATHErrorResources;
+
+/**
+ * Execute the matches() function.
+ *
+ * @author Mukul Gandhi
+ *
+ * @xsl.usage advanced
+ */
+public class FuncMatches extends Function3Args {
+
+ static final long serialVersionUID = 400116356230813776L;
+
+ private static final String FUNCTION_NAME = "matches()";
+
+ /**
+ * Execute the function. The function must return a valid object.
+ *
+ * @param xctxt The current execution context.
+ * @return A valid XObject.
+ *
+ * @throws javax.xml.transform.TransformerException
+ */
+ public XObject execute(XPathContext xctxt) throws javax.xml.transform.TransformerException
+ {
+ SourceLocator srcLocator = xctxt.getSAXLocator();
+
+ XMLString inputStr = m_arg0.execute(xctxt).xstr();
+ XMLString pattern = m_arg1.execute(xctxt).xstr();
+
+ XMLString flags = null;
+
+ if (m_arg2 != null) {
+ flags = m_arg2.execute(xctxt).xstr();
+ if (!RegExFunctionSupport.isFlagStrValid(flags.toString())) {
+ throw new javax.xml.transform.TransformerException(XSLMessages.createXPATHMessage(XPATHErrorResources.
+ ER_INVALID_REGEX_FLAGS, new Object[]{ FUNCTION_NAME }),
+ srcLocator);
+ }
+ }
+
+ boolean result = false;
+
+ try {
+ Matcher matcher = RegExFunctionSupport.regex(RegExFunctionSupport.trfPatternStrForSubtraction(pattern.toString()),
+ flags != null ? flags.toString() : null, inputStr.toString());
+ while (matcher.find()) {
+ result = true;
+ }
+ } catch (PatternSyntaxException ex) {
+ throw new javax.xml.transform.TransformerException(XSLMessages.createXPATHMessage(XPATHErrorResources.
+ ER_INVALID_REGEX, new Object[]{ FUNCTION_NAME }), srcLocator);
+ }
+
+ return result ? XBoolean.S_TRUE : XBoolean.S_FALSE;
+ }
+
+ /**
+ * Check that the number of arguments passed to this function is correct.
+ *
+ * @param argNum The number of arguments that is being passed to the function.
+ *
+ * @throws WrongNumberArgsException
+ */
+ public void checkNumberArgs(int argNum) throws WrongNumberArgsException
+ {
+ if (argNum < 2) {
+ reportWrongNumberArgs();
+ }
+ }
+
+ /**
+ * Constructs and throws a WrongNumberArgException with the appropriate
+ * message for this function object.
+ *
+ * @throws WrongNumberArgsException
+ */
+ protected void reportWrongNumberArgs() throws WrongNumberArgsException {
+ throw new WrongNumberArgsException(XSLMessages.createXPATHMessage(
+ XPATHErrorResources.ER_TWO_OR_THREE, null)); //"2 or 3"
+ }
+
+}
diff --git a/src/org/apache/xpath/functions/FuncReplace.java b/src/org/apache/xpath/functions/FuncReplace.java
new file mode 100644
index 00000000..814dbfd0
--- /dev/null
+++ b/src/org/apache/xpath/functions/FuncReplace.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * $Id$
+ */
+package org.apache.xpath.functions;
+
+import javax.xml.transform.SourceLocator;
+
+import org.apache.xalan.res.XSLMessages;
+import org.apache.xml.utils.XMLString;
+import org.apache.xpath.XPathContext;
+import org.apache.xpath.objects.XObject;
+import org.apache.xpath.objects.XString;
+import org.apache.xpath.regex.Matcher;
+import org.apache.xpath.regex.PatternSyntaxException;
+import org.apache.xpath.res.XPATHErrorResources;
+
+/**
+ * Execute the replace() function.
+ *
+ * @author Mukul Gandhi
+ *
+ * @xsl.usage advanced
+ */
+public class FuncReplace extends Function4Args {
+
+ static final long serialVersionUID = 400116356230813776L;
+
+ private static final String FUNCTION_NAME = "replace()";
+
+ /**
+ * Execute the function. The function must return a valid object.
+ *
+ * @param xctxt The current execution context.
+ * @return A valid XObject.
+ *
+ * @throws javax.xml.transform.TransformerException
+ */
+ public XObject execute(XPathContext xctxt) throws javax.xml.transform.TransformerException
+ {
+ SourceLocator srcLocator = xctxt.getSAXLocator();
+
+ XMLString inputStr = m_arg0.execute(xctxt).xstr();
+ XMLString pattern = m_arg1.execute(xctxt).xstr();
+ XMLString replacement = m_arg2.execute(xctxt).xstr();
+
+ XMLString flags = null;
+
+ if (m_arg3 != null) {
+ flags = m_arg3.execute(xctxt).xstr();
+ if (!RegExFunctionSupport.isFlagStrValid(flags.toString())) {
+ throw new javax.xml.transform.TransformerException(XSLMessages.createXPATHMessage(XPATHErrorResources.
+ ER_INVALID_REGEX_FLAGS, new Object[]{ FUNCTION_NAME }),
+ srcLocator);
+ }
+ }
+
+ String resultStr = null;
+
+ try {
+ Matcher matcher = RegExFunctionSupport.regex(RegExFunctionSupport.trfPatternStrForSubtraction(pattern.toString()),
+ flags != null ? flags.toString() : null, inputStr.toString());
+ resultStr = matcher.replaceAll(replacement.toString());
+ }
+ catch (PatternSyntaxException ex) {
+ throw new javax.xml.transform.TransformerException(XSLMessages.createXPATHMessage(XPATHErrorResources.
+ ER_INVALID_REGEX, new Object[]{ FUNCTION_NAME }),
+ srcLocator);
+ }
+
+ return new XString(resultStr);
+ }
+
+ /**
+ * Check that the number of arguments passed to this function is correct.
+ *
+ * @param argNum The number of arguments that is being passed to the function.
+ *
+ * @throws WrongNumberArgsException
+ */
+ public void checkNumberArgs(int argNum) throws WrongNumberArgsException
+ {
+ if (argNum < 3) {
+ reportWrongNumberArgs();
+ }
+ }
+
+ /**
+ * Constructs and throws a WrongNumberArgException with the appropriate
+ * message for this function object.
+ *
+ * @throws WrongNumberArgsException
+ */
+ protected void reportWrongNumberArgs() throws WrongNumberArgsException {
+ throw new WrongNumberArgsException(XSLMessages.createXPATHMessage(
+ XPATHErrorResources.ER_THREE_OR_FOUR, null)); //"3 or 4"
+ }
+
+}
diff --git a/src/org/apache/xpath/functions/Function2Args.java b/src/org/apache/xpath/functions/Function2Args.java
index 46e3b151..1c40a782 100644
--- a/src/org/apache/xpath/functions/Function2Args.java
+++ b/src/org/apache/xpath/functions/Function2Args.java
@@ -34,7 +34,8 @@ public class Function2Args extends FunctionOneArg
static final long serialVersionUID = 5574294996842710641L;
/** The second argument passed to the function (at index 1).
- * @serial */
+ *
+ */
Expression m_arg1;
/**
@@ -78,8 +79,6 @@ public class Function2Args extends FunctionOneArg
public void setArg(Expression arg, int argNum)
throws WrongNumberArgsException
{
-
- // System.out.println("argNum: "+argNum);
if (argNum == 0)
super.setArg(arg, argNum);
else if (1 == argNum)
diff --git a/src/org/apache/xpath/functions/Function3Args.java b/src/org/apache/xpath/functions/Function3Args.java
index e29fb223..5ac9bc98 100644
--- a/src/org/apache/xpath/functions/Function3Args.java
+++ b/src/org/apache/xpath/functions/Function3Args.java
@@ -34,7 +34,8 @@ public class Function3Args extends Function2Args
static final long serialVersionUID = 7915240747161506646L;
/** The third argument passed to the function (at index 2).
- * @serial */
+ *
+ */
Expression m_arg2;
/**
@@ -77,7 +78,6 @@ public class Function3Args extends Function2Args
public void setArg(Expression arg, int argNum)
throws WrongNumberArgsException
{
-
if (argNum < 2)
super.setArg(arg, argNum);
else if (2 == argNum)
diff --git a/src/org/apache/xpath/functions/Function3Args.java b/src/org/apache/xpath/functions/Function4Args.java
similarity index 73%
copy from src/org/apache/xpath/functions/Function3Args.java
copy to src/org/apache/xpath/functions/Function4Args.java
index e29fb223..16990c1b 100644
--- a/src/org/apache/xpath/functions/Function3Args.java
+++ b/src/org/apache/xpath/functions/Function4Args.java
@@ -26,34 +26,35 @@ import org.apache.xpath.ExpressionOwner;
import org.apache.xpath.XPathVisitor;
/**
- * Base class for functions that accept three arguments.
+ * Base class for functions that accept four arguments.
* @xsl.usage advanced
*/
-public class Function3Args extends Function2Args
+public class Function4Args extends Function3Args
{
- static final long serialVersionUID = 7915240747161506646L;
-
- /** The third argument passed to the function (at index 2).
- * @serial */
- Expression m_arg2;
+ private static final long serialVersionUID = 553916218361619933L;
+
+ /** The third argument passed to the function (at index 3).
+ *
+ */
+ Expression m_arg3;
/**
- * Return the third argument passed to the function (at index 2).
+ * Return the third argument passed to the function (at index 3).
*
- * @return An expression that represents the third argument passed to the
+ * @return An expression that represents the fourth argument passed to the
* function.
*/
- public Expression getArg2()
+ public Expression getArg3()
{
- return m_arg2;
+ return m_arg3;
}
/**
* This function is used to fixup variables from QNames to stack frame
* indexes at stylesheet build time.
- * @param vars List of QNames that correspond to variables. This list
+ * @param vars List of QNames that correspond to variables. This list
* should be searched backwards for the first qualified name that
- * corresponds to the variable reference qname. The position of the
+ * corresponds to the variable reference qname. The position of the
* QName in the vector from the start of the vector will be its position
* in the stack frame (but variables above the globalsTop value will need
* to be offset to the current stack frame).
@@ -61,32 +62,31 @@ public class Function3Args extends Function2Args
public void fixupVariables(java.util.Vector vars, int globalsSize)
{
super.fixupVariables(vars, globalsSize);
- if(null != m_arg2)
- m_arg2.fixupVariables(vars, globalsSize);
+ if (null != m_arg3)
+ m_arg3.fixupVariables(vars, globalsSize);
}
/**
- * Set an argument expression for a function. This method is called by the
+ * Set an argument expression for a function. This method is called by the
* XPath compiler.
*
* @param arg non-null expression that represents the argument.
* @param argNum The argument number index.
*
- * @throws WrongNumberArgsException If the argNum parameter is greater than 2.
+ * @throws WrongNumberArgsException If the argNum parameter is greater than 3.
*/
public void setArg(Expression arg, int argNum)
throws WrongNumberArgsException
{
-
- if (argNum < 2)
+ if (argNum < 3)
super.setArg(arg, argNum);
- else if (2 == argNum)
+ else if (3 == argNum)
{
- m_arg2 = arg;
- arg.exprSetParent(this);
+ m_arg3 = arg;
+ arg.exprSetParent(this);
}
else
- reportWrongNumberArgs();
+ reportWrongNumberArgs();
}
/**
@@ -99,7 +99,7 @@ public class Function3Args extends Function2Args
*/
public void checkNumberArgs(int argNum) throws WrongNumberArgsException
{
- if (argNum != 3)
+ if (argNum != 4)
reportWrongNumberArgs();
}
@@ -110,7 +110,7 @@ public class Function3Args extends Function2Args
* @throws WrongNumberArgsException
*/
protected void reportWrongNumberArgs() throws WrongNumberArgsException {
- throw new WrongNumberArgsException(XSLMessages.createXPATHMessage("three", null));
+ throw new WrongNumberArgsException(XSLMessages.createXPATHMessage("four", null));
}
/**
@@ -122,17 +122,17 @@ public class Function3Args extends Function2Args
public boolean canTraverseOutsideSubtree()
{
return super.canTraverseOutsideSubtree()
- ? true : m_arg2.canTraverseOutsideSubtree();
+ ? true : m_arg3.canTraverseOutsideSubtree();
}
- class Arg2Owner implements ExpressionOwner
+ class Arg3Owner implements ExpressionOwner
{
/**
* @see ExpressionOwner#getExpression()
*/
public Expression getExpression()
{
- return m_arg2;
+ return m_arg3;
}
@@ -141,8 +141,8 @@ public class Function3Args extends Function2Args
*/
public void setExpression(Expression exp)
{
- exp.exprSetParent(Function3Args.this);
- m_arg2 = exp;
+ exp.exprSetParent(Function4Args.this);
+ m_arg3 = exp;
}
}
@@ -153,8 +153,8 @@ public class Function3Args extends Function2Args
public void callArgVisitors(XPathVisitor visitor)
{
super.callArgVisitors(visitor);
- if(null != m_arg2)
- m_arg2.callVisitors(new Arg2Owner(), visitor);
+ if(null != m_arg3)
+ m_arg3.callVisitors(new Arg3Owner(), visitor);
}
/**
@@ -165,19 +165,18 @@ public class Function3Args extends Function2Args
if(!super.deepEquals(expr))
return false;
- if(null != m_arg2)
+ if(null != m_arg3)
{
- if(null == ((Function3Args)expr).m_arg2)
+ if(null == ((Function4Args)expr).m_arg3)
return false;
- if(!m_arg2.deepEquals(((Function3Args)expr).m_arg2))
+ if(!m_arg3.deepEquals(((Function4Args)expr).m_arg3))
return false;
}
- else if (null != ((Function3Args)expr).m_arg2)
+ else if (null != ((Function4Args)expr).m_arg3)
return false;
return true;
}
-
}
diff --git a/src/org/apache/xpath/functions/FunctionOneArg.java b/src/org/apache/xpath/functions/FunctionOneArg.java
index 611be309..acae5402 100644
--- a/src/org/apache/xpath/functions/FunctionOneArg.java
+++ b/src/org/apache/xpath/functions/FunctionOneArg.java
@@ -34,7 +34,8 @@ public class FunctionOneArg extends Function implements ExpressionOwner
static final long serialVersionUID = -5180174180765609758L;
/** The first argument passed to the function (at index 0).
- * @serial */
+ *
+ */
Expression m_arg0;
/**
diff --git a/src/org/apache/xpath/functions/RegExFunctionSupport.java b/src/org/apache/xpath/functions/RegExFunctionSupport.java
new file mode 100644
index 00000000..2dbda7b8
--- /dev/null
+++ b/src/org/apache/xpath/functions/RegExFunctionSupport.java
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * $Id$
+ */
+package org.apache.xpath.functions;
+
+import org.apache.xpath.regex.Matcher;
+import org.apache.xpath.regex.Pattern;
+
+/**
+ * This class provides supporting implementation, common to all
+ * XPath 3.1 functions requiring regex functionality.
+ *
+ * @author Mukul Gandhi
+ *
+ * @xsl.usage advanced
+ */
+public class RegExFunctionSupport {
+
+ private static final String validRegexflags = "smixq";
+
+ /*
+ * Transform regex pattern input string, to resolve differences between,
+ * XML Schema regex subtraction operator and Java regex subtraction operator.
+ */
+ public static String trfPatternStrForSubtraction(String pattern) {
+ String transformedPatternStr = pattern;
+
+ int indx1 = transformedPatternStr.indexOf("-[");
+ if (indx1 != -1) {
+ String subsPrev = transformedPatternStr.substring(0, indx1);
+ String subsAfter = transformedPatternStr.substring(indx1 + 2);
+ if ((subsPrev.indexOf("[") != -1) && (subsAfter.indexOf("]]") != -1)) {
+ transformedPatternStr = transformedPatternStr.replaceAll("\\-\\[",
+ "&&[^");
+ }
+ }
+
+ return transformedPatternStr;
+ }
+
+ public static Matcher regex(String pattern, String flags, String src) {
+ Matcher matcher = compileAndExecute(pattern, flags, src);
+ return matcher;
+ }
+
+ public static boolean isFlagStrValid(String flags) {
+ boolean flagStrValid = true;
+
+ if (flags.length() > 0) {
+ for (int idx = 0; idx < flags.length(); idx++) {
+ if (validRegexflags.indexOf(flags.charAt(idx)) == -1) {
+ flagStrValid = false;
+ break;
+ }
+ }
+ }
+
+ return flagStrValid;
+ }
+
+ private static Matcher compileAndExecute(String pattern, String flags, String src) {
+ int flag = Pattern.UNIX_LINES;
+
+ if (flags != null) {
+ if (flags.indexOf("s") >= 0) {
+ flag = flag | Pattern.DOTALL;
+ }
+ if (flags.indexOf("m") >= 0) {
+ flag = flag | Pattern.MULTILINE;
+ }
+ if (flags.indexOf("i") >= 0) {
+ flag = flag | Pattern.CASE_INSENSITIVE;
+ }
+ if (flags.indexOf("x") >= 0) {
+ flag = flag | Pattern.IGNORE_WHITESPACE;
+ }
+ if (flags.indexOf("q") >= 0) {
+ flag = flag | Pattern.LITERAL;
+ }
+ }
+
+ Pattern p = Pattern.compile(pattern, flag);
+
+ return p.matcher(src);
+ }
+
+}
diff --git a/src/org/apache/xpath/regex/ASCII.java b/src/org/apache/xpath/regex/ASCII.java
new file mode 100644
index 00000000..34a65ece
--- /dev/null
+++ b/src/org/apache/xpath/regex/ASCII.java
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 1999, 2000, Oracle and/or its affiliates. All rights reserved.
+ * ORACLE PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ */
+
+package org.apache.xpath.regex;
+
+
+/**
+ * Utility class that implements the standard C ctype functionality.
+ *
+ * @author Hong Zhang
+ */
+
+final class ASCII {
+
+ static final int UPPER = 0x00000100;
+
+ static final int LOWER = 0x00000200;
+
+ static final int DIGIT = 0x00000400;
+
+ static final int SPACE = 0x00000800;
+
+ static final int PUNCT = 0x00001000;
+
+ static final int CNTRL = 0x00002000;
+
+ static final int BLANK = 0x00004000;
+
+ static final int HEX = 0x00008000;
+
+ static final int UNDER = 0x00010000;
+
+ static final int ASCII = 0x0000FF00;
+
+ static final int ALPHA = (UPPER|LOWER);
+
+ static final int ALNUM = (UPPER|LOWER|DIGIT);
+
+ static final int GRAPH = (PUNCT|UPPER|LOWER|DIGIT);
+
+ static final int WORD = (UPPER|LOWER|UNDER|DIGIT);
+
+ static final int XDIGIT = (HEX);
+
+ private static final int[] ctype = new int[] {
+ CNTRL, /* 00 (NUL) */
+ CNTRL, /* 01 (SOH) */
+ CNTRL, /* 02 (STX) */
+ CNTRL, /* 03 (ETX) */
+ CNTRL, /* 04 (EOT) */
+ CNTRL, /* 05 (ENQ) */
+ CNTRL, /* 06 (ACK) */
+ CNTRL, /* 07 (BEL) */
+ CNTRL, /* 08 (BS) */
+ SPACE+CNTRL+BLANK, /* 09 (HT) */
+ SPACE+CNTRL, /* 0A (LF) */
+ SPACE+CNTRL, /* 0B (VT) */
+ SPACE+CNTRL, /* 0C (FF) */
+ SPACE+CNTRL, /* 0D (CR) */
+ CNTRL, /* 0E (SI) */
+ CNTRL, /* 0F (SO) */
+ CNTRL, /* 10 (DLE) */
+ CNTRL, /* 11 (DC1) */
+ CNTRL, /* 12 (DC2) */
+ CNTRL, /* 13 (DC3) */
+ CNTRL, /* 14 (DC4) */
+ CNTRL, /* 15 (NAK) */
+ CNTRL, /* 16 (SYN) */
+ CNTRL, /* 17 (ETB) */
+ CNTRL, /* 18 (CAN) */
+ CNTRL, /* 19 (EM) */
+ CNTRL, /* 1A (SUB) */
+ CNTRL, /* 1B (ESC) */
+ CNTRL, /* 1C (FS) */
+ CNTRL, /* 1D (GS) */
+ CNTRL, /* 1E (RS) */
+ CNTRL, /* 1F (US) */
+ SPACE+BLANK, /* 20 SPACE */
+ PUNCT, /* 21 ! */
+ PUNCT, /* 22 " */
+ PUNCT, /* 23 # */
+ PUNCT, /* 24 $ */
+ PUNCT, /* 25 % */
+ PUNCT, /* 26 & */
+ PUNCT, /* 27 ' */
+ PUNCT, /* 28 ( */
+ PUNCT, /* 29 ) */
+ PUNCT, /* 2A * */
+ PUNCT, /* 2B + */
+ PUNCT, /* 2C , */
+ PUNCT, /* 2D - */
+ PUNCT, /* 2E . */
+ PUNCT, /* 2F / */
+ DIGIT+HEX+0, /* 30 0 */
+ DIGIT+HEX+1, /* 31 1 */
+ DIGIT+HEX+2, /* 32 2 */
+ DIGIT+HEX+3, /* 33 3 */
+ DIGIT+HEX+4, /* 34 4 */
+ DIGIT+HEX+5, /* 35 5 */
+ DIGIT+HEX+6, /* 36 6 */
+ DIGIT+HEX+7, /* 37 7 */
+ DIGIT+HEX+8, /* 38 8 */
+ DIGIT+HEX+9, /* 39 9 */
+ PUNCT, /* 3A : */
+ PUNCT, /* 3B ; */
+ PUNCT, /* 3C < */
+ PUNCT, /* 3D = */
+ PUNCT, /* 3E > */
+ PUNCT, /* 3F ? */
+ PUNCT, /* 40 @ */
+ UPPER+HEX+10, /* 41 A */
+ UPPER+HEX+11, /* 42 B */
+ UPPER+HEX+12, /* 43 C */
+ UPPER+HEX+13, /* 44 D */
+ UPPER+HEX+14, /* 45 E */
+ UPPER+HEX+15, /* 46 F */
+ UPPER+16, /* 47 G */
+ UPPER+17, /* 48 H */
+ UPPER+18, /* 49 I */
+ UPPER+19, /* 4A J */
+ UPPER+20, /* 4B K */
+ UPPER+21, /* 4C L */
+ UPPER+22, /* 4D M */
+ UPPER+23, /* 4E N */
+ UPPER+24, /* 4F O */
+ UPPER+25, /* 50 P */
+ UPPER+26, /* 51 Q */
+ UPPER+27, /* 52 R */
+ UPPER+28, /* 53 S */
+ UPPER+29, /* 54 T */
+ UPPER+30, /* 55 U */
+ UPPER+31, /* 56 V */
+ UPPER+32, /* 57 W */
+ UPPER+33, /* 58 X */
+ UPPER+34, /* 59 Y */
+ UPPER+35, /* 5A Z */
+ PUNCT, /* 5B [ */
+ PUNCT, /* 5C \ */
+ PUNCT, /* 5D ] */
+ PUNCT, /* 5E ^ */
+ PUNCT|UNDER, /* 5F _ */
+ PUNCT, /* 60 ` */
+ LOWER+HEX+10, /* 61 a */
+ LOWER+HEX+11, /* 62 b */
+ LOWER+HEX+12, /* 63 c */
+ LOWER+HEX+13, /* 64 d */
+ LOWER+HEX+14, /* 65 e */
+ LOWER+HEX+15, /* 66 f */
+ LOWER+16, /* 67 g */
+ LOWER+17, /* 68 h */
+ LOWER+18, /* 69 i */
+ LOWER+19, /* 6A j */
+ LOWER+20, /* 6B k */
+ LOWER+21, /* 6C l */
+ LOWER+22, /* 6D m */
+ LOWER+23, /* 6E n */
+ LOWER+24, /* 6F o */
+ LOWER+25, /* 70 p */
+ LOWER+26, /* 71 q */
+ LOWER+27, /* 72 r */
+ LOWER+28, /* 73 s */
+ LOWER+29, /* 74 t */
+ LOWER+30, /* 75 u */
+ LOWER+31, /* 76 v */
+ LOWER+32, /* 77 w */
+ LOWER+33, /* 78 x */
+ LOWER+34, /* 79 y */
+ LOWER+35, /* 7A z */
+ PUNCT, /* 7B { */
+ PUNCT, /* 7C | */
+ PUNCT, /* 7D } */
+ PUNCT, /* 7E ~ */
+ CNTRL, /* 7F (DEL) */
+ };
+
+ static int getType(int ch) {
+ return ((ch & 0xFFFFFF80) == 0 ? ctype[ch] : 0);
+ }
+
+ static boolean isType(int ch, int type) {
+ return (getType(ch) & type) != 0;
+ }
+
+ static boolean isAscii(int ch) {
+ return ((ch & 0xFFFFFF80) == 0);
+ }
+
+ static boolean isAlpha(int ch) {
+ return isType(ch, ALPHA);
+ }
+
+ static boolean isDigit(int ch) {
+ return ((ch-'0')|('9'-ch)) >= 0;
+ }
+
+ static boolean isAlnum(int ch) {
+ return isType(ch, ALNUM);
+ }
+
+ static boolean isGraph(int ch) {
+ return isType(ch, GRAPH);
+ }
+
+ static boolean isPrint(int ch) {
+ return ((ch-0x20)|(0x7E-ch)) >= 0;
+ }
+
+ static boolean isPunct(int ch) {
+ return isType(ch, PUNCT);
+ }
+
+ static boolean isSpace(int ch) {
+ return isType(ch, SPACE);
+ }
+
+ static boolean isHexDigit(int ch) {
+ return isType(ch, HEX);
+ }
+
+ static boolean isOctDigit(int ch) {
+ return ((ch-'0')|('7'-ch)) >= 0;
+ }
+
+ static boolean isCntrl(int ch) {
+ return isType(ch, CNTRL);
+ }
+
+ static boolean isLower(int ch) {
+ return ((ch-'a')|('z'-ch)) >= 0;
+ }
+
+ static boolean isUpper(int ch) {
+ return ((ch-'A')|('Z'-ch)) >= 0;
+ }
+
+ static boolean isWord(int ch) {
+ return isType(ch, WORD);
+ }
+
+ static int toDigit(int ch) {
+ return (ctype[ch & 0x7F] & 0x3F);
+ }
+
+ static int toLower(int ch) {
+ return isUpper(ch) ? (ch + 0x20) : ch;
+ }
+
+ static int toUpper(int ch) {
+ return isLower(ch) ? (ch - 0x20) : ch;
+ }
+
+}
diff --git a/src/org/apache/xpath/regex/MatchResult.java b/src/org/apache/xpath/regex/MatchResult.java
new file mode 100644
index 00000000..f1d37d9a
--- /dev/null
+++ b/src/org/apache/xpath/regex/MatchResult.java
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2003, 2013, Oracle and/or its affiliates. All rights reserved.
+ * ORACLE PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ */
+
+package org.apache.xpath.regex;
+
+/**
+ * The result of a match operation.
+ *
+ * <p>This interface contains query methods used to determine the
+ * results of a match against a regular expression. The match boundaries,
+ * groups and group boundaries can be seen but not modified through
+ * a <code>MatchResult</code>.
+ *
+ * @author Michael McCloskey
+ * @see Matcher
+ * @since 1.5
+ */
+public interface MatchResult {
+
+ /**
+ * Returns the start index of the match.
+ *
+ * @return The index of the first character matched
+ *
+ * @throws IllegalStateException
+ * If no match has yet been attempted,
+ * or if the previous match operation failed
+ */
+ public int start();
+
+ /**
+ * Returns the start index of the subsequence captured by the given group
+ * during this match.
+ *
+ * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left
+ * to right, starting at one. Group zero denotes the entire pattern, so
+ * the expression <i>m.</i><tt>start(0)</tt> is equivalent to
+ * <i>m.</i><tt>start()</tt>. </p>
+ *
+ * @param group
+ * The index of a capturing group in this matcher's pattern
+ *
+ * @return The index of the first character captured by the group,
+ * or <tt>-1</tt> if the match was successful but the group
+ * itself did not match anything
+ *
+ * @throws IllegalStateException
+ * If no match has yet been attempted,
+ * or if the previous match operation failed
+ *
+ * @throws IndexOutOfBoundsException
+ * If there is no capturing group in the pattern
+ * with the given index
+ */
+ public int start(int group);
+
+ /**
+ * Returns the offset after the last character matched.
+ *
+ * @return The offset after the last character matched
+ *
+ * @throws IllegalStateException
+ * If no match has yet been attempted,
+ * or if the previous match operation failed
+ */
+ public int end();
+
+ /**
+ * Returns the offset after the last character of the subsequence
+ * captured by the given group during this match.
+ *
+ * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left
+ * to right, starting at one. Group zero denotes the entire pattern, so
+ * the expression <i>m.</i><tt>end(0)</tt> is equivalent to
+ * <i>m.</i><tt>end()</tt>. </p>
+ *
+ * @param group
+ * The index of a capturing group in this matcher's pattern
+ *
+ * @return The offset after the last character captured by the group,
+ * or <tt>-1</tt> if the match was successful
+ * but the group itself did not match anything
+ *
+ * @throws IllegalStateException
+ * If no match has yet been attempted,
+ * or if the previous match operation failed
+ *
+ * @throws IndexOutOfBoundsException
+ * If there is no capturing group in the pattern
+ * with the given index
+ */
+ public int end(int group);
+
+ /**
+ * Returns the input subsequence matched by the previous match.
+ *
+ * <p> For a matcher <i>m</i> with input sequence <i>s</i>,
+ * the expressions <i>m.</i><tt>group()</tt> and
+ * <i>s.</i><tt>substring(</tt><i>m.</i><tt>start(),</tt> <i>m.</i><tt>end())</tt>
+ * are equivalent. </p>
+ *
+ * <p> Note that some patterns, for example <tt>a*</tt>, match the empty
+ * string. This method will return the empty string when the pattern
+ * successfully matches the empty string in the input. </p>
+ *
+ * @return The (possibly empty) subsequence matched by the previous match,
+ * in string form
+ *
+ * @throws IllegalStateException
+ * If no match has yet been attempted,
+ * or if the previous match operation failed
+ */
+ public String group();
+
+ /**
+ * Returns the input subsequence captured by the given group during the
+ * previous match operation.
+ *
+ * <p> For a matcher <i>m</i>, input sequence <i>s</i>, and group index
+ * <i>g</i>, the expressions <i>m.</i><tt>group(</tt><i>g</i><tt>)</tt> and
+ * <i>s.</i><tt>substring(</tt><i>m.</i><tt>start(</tt><i>g</i><tt>),</tt> <i>m.</i><tt>end(</tt><i>g</i><tt>))</tt>
+ * are equivalent. </p>
+ *
+ * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left
+ * to right, starting at one. Group zero denotes the entire pattern, so
+ * the expression <tt>m.group(0)</tt> is equivalent to <tt>m.group()</tt>.
+ * </p>
+ *
+ * <p> If the match was successful but the group specified failed to match
+ * any part of the input sequence, then <tt>null</tt> is returned. Note
+ * that some groups, for example <tt>(a*)</tt>, match the empty string.
+ * This method will return the empty string when such a group successfully
+ * matches the empty string in the input. </p>
+ *
+ * @param group
+ * The index of a capturing group in this matcher's pattern
+ *
+ * @return The (possibly empty) subsequence captured by the group
+ * during the previous match, or <tt>null</tt> if the group
+ * failed to match part of the input
+ *
+ * @throws IllegalStateException
+ * If no match has yet been attempted,
+ * or if the previous match operation failed
+ *
+ * @throws IndexOutOfBoundsException
+ * If there is no capturing group in the pattern
+ * with the given index
+ */
+ public String group(int group);
+
+ /**
+ * Returns the number of capturing groups in this match result's pattern.
+ *
+ * <p> Group zero denotes the entire pattern by convention. It is not
+ * included in this count.
+ *
+ * <p> Any non-negative integer smaller than or equal to the value
+ * returned by this method is guaranteed to be a valid group index for
+ * this matcher. </p>
+ *
+ * @return The number of capturing groups in this matcher's pattern
+ */
+ public int groupCount();
+
+}
diff --git a/src/org/apache/xpath/regex/Matcher.java b/src/org/apache/xpath/regex/Matcher.java
new file mode 100644
index 00000000..a6494607
--- /dev/null
+++ b/src/org/apache/xpath/regex/Matcher.java
@@ -0,0 +1,1319 @@
+/*
+ * Copyright (c) 1999, 2013, Oracle and/or its affiliates. All rights reserved.
+ * ORACLE PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ */
+
+package org.apache.xpath.regex;
+
+import java.util.Objects;
+
+/**
+ * An engine that performs match operations on a {@linkplain java.lang.CharSequence
+ * character sequence} by interpreting a {@link Pattern}.
+ *
+ * <p> A matcher is created from a pattern by invoking the pattern's {@link
+ * Pattern#matcher matcher} method. Once created, a matcher can be used to
+ * perform three different kinds of match operations:
+ *
+ * <ul>
+ *
+ * <li><p> The {@link #matches matches} method attempts to match the entire
+ * input sequence against the pattern. </p></li>
+ *
+ * <li><p> The {@link #lookingAt lookingAt} method attempts to match the
+ * input sequence, starting at the beginning, against the pattern. </p></li>
+ *
+ * <li><p> The {@link #find find} method scans the input sequence looking for
+ * the next subsequence that matches the pattern. </p></li>
+ *
+ * </ul>
+ *
+ * <p> Each of these methods returns a boolean indicating success or failure.
+ * More information about a successful match can be obtained by querying the
+ * state of the matcher.
+ *
+ * <p> A matcher finds matches in a subset of its input called the
+ * <i>region</i>. By default, the region contains all of the matcher's input.
+ * The region can be modified via the{@link #region region} method and queried
+ * via the {@link #regionStart regionStart} and {@link #regionEnd regionEnd}
+ * methods. The way that the region boundaries interact with some pattern
+ * constructs can be changed. See {@link #useAnchoringBounds
+ * useAnchoringBounds} and {@link #useTransparentBounds useTransparentBounds}
+ * for more details.
+ *
+ * <p> This class also defines methods for replacing matched subsequences with
+ * new strings whose contents can, if desired, be computed from the match
+ * result. The {@link #appendReplacement appendReplacement} and {@link
+ * #appendTail appendTail} methods can be used in tandem in order to collect
+ * the result into an existing string buffer, or the more convenient {@link
+ * #replaceAll replaceAll} method can be used to create a string in which every
+ * matching subsequence in the input sequence is replaced.
+ *
+ * <p> The explicit state of a matcher includes the start and end indices of
+ * the most recent successful match. It also includes the start and end
+ * indices of the input subsequence captured by each <a
+ * href="Pattern.html#cg">capturing group</a> in the pattern as well as a total
+ * count of such subsequences. As a convenience, methods are also provided for
+ * returning these captured subsequences in string form.
+ *
+ * <p> The explicit state of a matcher is initially undefined; attempting to
+ * query any part of it before a successful match will cause an {@link
+ * IllegalStateException} to be thrown. The explicit state of a matcher is
+ * recomputed by every match operation.
+ *
+ * <p> The implicit state of a matcher includes the input character sequence as
+ * well as the <i>append position</i>, which is initially zero and is updated
+ * by the {@link #appendReplacement appendReplacement} method.
+ *
+ * <p> A matcher may be reset explicitly by invoking its {@link #reset()}
+ * method or, if a new input sequence is desired, its {@link
+ * #reset(java.lang.CharSequence) reset(CharSequence)} method. Resetting a
+ * matcher discards its explicit state information and sets the append position
+ * to zero.
+ *
+ * <p> Instances of this class are not safe for use by multiple concurrent
+ * threads. </p>
+ *
+ *
+ * @author Mike McCloskey
+ * @author Mark Reinhold
+ * @author JSR-51 Expert Group
+ * @since 1.4
+ * @spec JSR-51
+ */
+
+public final class Matcher implements MatchResult {
+
+ /**
+ * The Pattern object that created this Matcher.
+ */
+ Pattern parentPattern;
+
+ /**
+ * The storage used by groups. They may contain invalid values if
+ * a group was skipped during the matching.
+ */
+ int[] groups;
+
+ /**
+ * The range within the sequence that is to be matched. Anchors
+ * will match at these "hard" boundaries. Changing the region
+ * changes these values.
+ */
+ int from, to;
+
+ /**
+ * Lookbehind uses this value to ensure that the subexpression
+ * match ends at the point where the lookbehind was encountered.
+ */
+ int lookbehindTo;
+
+ /**
+ * The original string being matched.
+ */
+ CharSequence text;
+
+ /**
+ * Matcher state used by the last node. NOANCHOR is used when a
+ * match does not have to consume all of the input. ENDANCHOR is
+ * the mode used for matching all the input.
+ */
+ static final int ENDANCHOR = 1;
+ static final int NOANCHOR = 0;
+ int acceptMode = NOANCHOR;
+
+ /**
+ * The range of string that last matched the pattern. If the last
+ * match failed then first is -1; last initially holds 0 then it
+ * holds the index of the end of the last match (which is where the
+ * next search starts).
+ */
+ int first = -1, last = 0;
+
+ /**
+ * The end index of what matched in the last match operation.
+ */
+ int oldLast = -1;
+
+ /**
+ * The index of the last position appended in a substitution.
+ */
+ int lastAppendPosition = 0;
+
+ /**
+ * Storage used by nodes to tell what repetition they are on in
+ * a pattern, and where groups begin. The nodes themselves are stateless,
+ * so they rely on this field to hold state during a match.
+ */
+ int[] locals;
+
+ /**
+ * Boolean indicating whether or not more input could change
+ * the results of the last match.
+ *
+ * If hitEnd is true, and a match was found, then more input
+ * might cause a different match to be found.
+ * If hitEnd is true and a match was not found, then more
+ * input could cause a match to be found.
+ * If hitEnd is false and a match was found, then more input
+ * will not change the match.
+ * If hitEnd is false and a match was not found, then more
+ * input will not cause a match to be found.
+ */
+ boolean hitEnd;
+
+ /**
+ * Boolean indicating whether or not more input could change
+ * a positive match into a negative one.
+ *
+ * If requireEnd is true, and a match was found, then more
+ * input could cause the match to be lost.
+ * If requireEnd is false and a match was found, then more
+ * input might change the match but the match won't be lost.
+ * If a match was not found, then requireEnd has no meaning.
+ */
+ boolean requireEnd;
+
+ /**
+ * If transparentBounds is true then the boundaries of this
+ * matcher's region are transparent to lookahead, lookbehind,
+ * and boundary matching constructs that try to see beyond them.
+ */
+ boolean transparentBounds = false;
+
+ /**
+ * If anchoringBounds is true then the boundaries of this
+ * matcher's region match anchors such as ^ and $.
+ */
+ boolean anchoringBounds = true;
+
+ /**
+ * No default constructor.
+ */
+ Matcher() {
+ }
+
+ /**
+ * All matchers have the state used by Pattern during a match.
+ */
+ Matcher(Pattern parent, CharSequence text) {
+ this.parentPattern = parent;
+ this.text = text;
+
+ // Allocate state storage
+ int parentGroupCount = Math.max(parent.capturingGroupCount, 10);
+ groups = new int[parentGroupCount * 2];
+ locals = new int[parent.localCount];
+
+ // Put fields into initial states
+ reset();
+ }
+
+ /**
+ * Returns the pattern that is interpreted by this matcher.
+ *
+ * @return The pattern for which this matcher was created
+ */
+ public Pattern pattern() {
+ return parentPattern;
+ }
+
+ /**
+ * Returns the match state of this matcher as a {@link MatchResult}.
+ * The result is unaffected by subsequent operations performed upon this
+ * matcher.
+ *
+ * @return a <code>MatchResult</code> with the state of this matcher
+ * @since 1.5
+ */
+ public MatchResult toMatchResult() {
+ Matcher result = new Matcher(this.parentPattern, text.toString());
+ result.first = this.first;
+ result.last = this.last;
+ result.groups = this.groups.clone();
+ return result;
+ }
+
+ /**
+ * Changes the <tt>Pattern</tt> that this <tt>Matcher</tt> uses to
+ * find matches with.
+ *
+ * <p> This method causes this matcher to lose information
+ * about the groups of the last match that occurred. The
+ * matcher's position in the input is maintained and its
+ * last append position is unaffected.</p>
+ *
+ * @param newPattern
+ * The new pattern used by this matcher
+ * @return This matcher
+ * @throws IllegalArgumentException
+ * If newPattern is <tt>null</tt>
+ * @since 1.5
+ */
+ public Matcher usePattern(Pattern newPattern) {
+ if (newPattern == null)
+ throw new IllegalArgumentException("Pattern cannot be null");
+ parentPattern = newPattern;
+
+ // Reallocate state storage
+ int parentGroupCount = Math.max(newPattern.capturingGroupCount, 10);
+ groups = new int[parentGroupCount * 2];
+ locals = new int[newPattern.localCount];
+ for (int i = 0; i < groups.length; i++)
+ groups[i] = -1;
+ for (int i = 0; i < locals.length; i++)
+ locals[i] = -1;
+ return this;
+ }
+
+ /**
+ * Resets this matcher.
+ *
+ * <p> Resetting a matcher discards all of its explicit state information
+ * and sets its append position to zero. The matcher's region is set to the
+ * default region, which is its entire character sequence. The anchoring
+ * and transparency of this matcher's region boundaries are unaffected.
+ *
+ * @return This matcher
+ */
+ public Matcher reset() {
+ first = -1;
+ last = 0;
+ oldLast = -1;
+ for(int i=0; i<groups.length; i++)
+ groups[i] = -1;
+ for(int i=0; i<locals.length; i++)
+ locals[i] = -1;
+ lastAppendPosition = 0;
+ from = 0;
+ to = getTextLength();
+ return this;
+ }
+
+ /**
+ * Resets this matcher with a new input sequence.
+ *
+ * <p> Resetting a matcher discards all of its explicit state information
+ * and sets its append position to zero. The matcher's region is set to
+ * the default region, which is its entire character sequence. The
+ * anchoring and transparency of this matcher's region boundaries are
+ * unaffected.
+ *
+ * @param input
+ * The new input character sequence
+ *
+ * @return This matcher
+ */
+ public Matcher reset(CharSequence input) {
+ text = input;
+ return reset();
+ }
+
+ /**
+ * Returns the start index of the previous match.
+ *
+ * @return The index of the first character matched
+ *
+ * @throws IllegalStateException
+ * If no match has yet been attempted,
+ * or if the previous match operation failed
+ */
+ public int start() {
+ if (first < 0)
+ throw new IllegalStateException("No match available");
+ return first;
+ }
+
+ /**
+ * Returns the start index of the subsequence captured by the given group
+ * during the previous match operation.
+ *
+ * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left
+ * to right, starting at one. Group zero denotes the entire pattern, so
+ * the expression <i>m.</i><tt>start(0)</tt> is equivalent to
+ * <i>m.</i><tt>start()</tt>. </p>
+ *
+ * @param group
+ * The index of a capturing group in this matcher's pattern
+ *
+ * @return The index of the first character captured by the group,
+ * or <tt>-1</tt> if the match was successful but the group
+ * itself did not match anything
+ *
+ * @throws IllegalStateException
+ * If no match has yet been attempted,
+ * or if the previous match operation failed
+ *
+ * @throws IndexOutOfBoundsException
+ * If there is no capturing group in the pattern
+ * with the given index
+ */
+ public int start(int group) {
+ if (first < 0)
+ throw new IllegalStateException("No match available");
+ if (group < 0 || group > groupCount())
+ throw new IndexOutOfBoundsException("No group " + group);
+ return groups[group * 2];
+ }
+
+ /**
+ * Returns the start index of the subsequence captured by the given
+ * <a href="Pattern.html#groupname">named-capturing group</a> during the
+ * previous match operation.
+ *
+ * @param name
+ * The name of a named-capturing group in this matcher's pattern
+ *
+ * @return The index of the first character captured by the group,
+ * or {@code -1} if the match was successful but the group
+ * itself did not match anything
+ *
+ * @throws IllegalStateException
+ * If no match has yet been attempted,
+ * or if the previous match operation failed
+ *
+ * @throws IllegalArgumentException
+ * If there is no capturing group in the pattern
+ * with the given name
+ * @since 1.8
+ */
+ public int start(String name) {
+ return groups[getMatchedGroupIndex(name) * 2];
+ }
+
+ /**
+ * Returns the offset after the last character matched.
+ *
+ * @return The offset after the last character matched
+ *
+ * @throws IllegalStateException
+ * If no match has yet been attempted,
+ * or if the previous match operation failed
+ */
+ public int end() {
+ if (first < 0)
+ throw new IllegalStateException("No match available");
+ return last;
+ }
+
+ /**
+ * Returns the offset after the last character of the subsequence
+ * captured by the given group during the previous match operation.
+ *
+ * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left
+ * to right, starting at one. Group zero denotes the entire pattern, so
+ * the expression <i>m.</i><tt>end(0)</tt> is equivalent to
+ * <i>m.</i><tt>end()</tt>. </p>
+ *
+ * @param group
+ * The index of a capturing group in this matcher's pattern
+ *
+ * @return The offset after the last character captured by the group,
+ * or <tt>-1</tt> if the match was successful
+ * but the group itself did not match anything
+ *
+ * @throws IllegalStateException
+ * If no match has yet been attempted,
+ * or if the previous match operation failed
+ *
+ * @throws IndexOutOfBoundsException
+ * If there is no capturing group in the pattern
+ * with the given index
+ */
+ public int end(int group) {
+ if (first < 0)
+ throw new IllegalStateException("No match available");
+ if (group < 0 || group > groupCount())
+ throw new IndexOutOfBoundsException("No group " + group);
+ return groups[group * 2 + 1];
+ }
+
+ /**
+ * Returns the offset after the last character of the subsequence
+ * captured by the given <a href="Pattern.html#groupname">named-capturing
+ * group</a> during the previous match operation.
+ *
+ * @param name
+ * The name of a named-capturing group in this matcher's pattern
+ *
+ * @return The offset after the last character captured by the group,
+ * or {@code -1} if the match was successful
+ * but the group itself did not match anything
+ *
+ * @throws IllegalStateException
+ * If no match has yet been attempted,
+ * or if the previous match operation failed
+ *
+ * @throws IllegalArgumentException
+ * If there is no capturing group in the pattern
+ * with the given name
+ * @since 1.8
+ */
+ public int end(String name) {
+ return groups[getMatchedGroupIndex(name) * 2 + 1];
+ }
+
+ /**
+ * Returns the input subsequence matched by the previous match.
+ *
+ * <p> For a matcher <i>m</i> with input sequence <i>s</i>,
+ * the expressions <i>m.</i><tt>group()</tt> and
+ * <i>s.</i><tt>substring(</tt><i>m.</i><tt>start(),</tt> <i>m.</i><tt>end())</tt>
+ * are equivalent. </p>
+ *
+ * <p> Note that some patterns, for example <tt>a*</tt>, match the empty
+ * string. This method will return the empty string when the pattern
+ * successfully matches the empty string in the input. </p>
+ *
+ * @return The (possibly empty) subsequence matched by the previous match,
+ * in string form
+ *
+ * @throws IllegalStateException
+ * If no match has yet been attempted,
+ * or if the previous match operation failed
+ */
+ public String group() {
+ return group(0);
+ }
+
+ /**
+ * Returns the input subsequence captured by the given group during the
+ * previous match operation.
+ *
+ * <p> For a matcher <i>m</i>, input sequence <i>s</i>, and group index
+ * <i>g</i>, the expressions <i>m.</i><tt>group(</tt><i>g</i><tt>)</tt> and
+ * <i>s.</i><tt>substring(</tt><i>m.</i><tt>start(</tt><i>g</i><tt>),</tt> <i>m.</i><tt>end(</tt><i>g</i><tt>))</tt>
+ * are equivalent. </p>
+ *
+ * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left
+ * to right, starting at one. Group zero denotes the entire pattern, so
+ * the expression <tt>m.group(0)</tt> is equivalent to <tt>m.group()</tt>.
+ * </p>
+ *
+ * <p> If the match was successful but the group specified failed to match
+ * any part of the input sequence, then <tt>null</tt> is returned. Note
+ * that some groups, for example <tt>(a*)</tt>, match the empty string.
+ * This method will return the empty string when such a group successfully
+ * matches the empty string in the input. </p>
+ *
+ * @param group
+ * The index of a capturing group in this matcher's pattern
+ *
+ * @return The (possibly empty) subsequence captured by the group
+ * during the previous match, or <tt>null</tt> if the group
+ * failed to match part of the input
+ *
+ * @throws IllegalStateException
+ * If no match has yet been attempted,
+ * or if the previous match operation failed
+ *
+ * @throws IndexOutOfBoundsException
+ * If there is no capturing group in the pattern
+ * with the given index
+ */
+ public String group(int group) {
+ if (first < 0)
+ throw new IllegalStateException("No match found");
+ if (group < 0 || group > groupCount())
+ throw new IndexOutOfBoundsException("No group " + group);
+ if ((groups[group*2] == -1) || (groups[group*2+1] == -1))
+ return null;
+ return getSubSequence(groups[group * 2], groups[group * 2 + 1]).toString();
+ }
+
+ /**
+ * Returns the input subsequence captured by the given
+ * <a href="Pattern.html#groupname">named-capturing group</a> during the previous
+ * match operation.
+ *
+ * <p> If the match was successful but the group specified failed to match
+ * any part of the input sequence, then <tt>null</tt> is returned. Note
+ * that some groups, for example <tt>(a*)</tt>, match the empty string.
+ * This method will return the empty string when such a group successfully
+ * matches the empty string in the input. </p>
+ *
+ * @param name
+ * The name of a named-capturing group in this matcher's pattern
+ *
+ * @return The (possibly empty) subsequence captured by the named group
+ * during the previous match, or <tt>null</tt> if the group
+ * failed to match part of the input
+ *
+ * @throws IllegalStateException
+ * If no match has yet been attempted,
+ * or if the previous match operation failed
+ *
+ * @throws IllegalArgumentException
+ * If there is no capturing group in the pattern
+ * with the given name
+ * @since 1.7
+ */
+ public String group(String name) {
+ int group = getMatchedGroupIndex(name);
+ if ((groups[group*2] == -1) || (groups[group*2+1] == -1))
+ return null;
+ return getSubSequence(groups[group * 2], groups[group * 2 + 1]).toString();
+ }
+
+ /**
+ * Returns the number of capturing groups in this matcher's pattern.
+ *
+ * <p> Group zero denotes the entire pattern by convention. It is not
+ * included in this count.
+ *
+ * <p> Any non-negative integer smaller than or equal to the value
+ * returned by this method is guaranteed to be a valid group index for
+ * this matcher. </p>
+ *
+ * @return The number of capturing groups in this matcher's pattern
+ */
+ public int groupCount() {
+ return parentPattern.capturingGroupCount - 1;
+ }
+
+ /**
+ * Attempts to match the entire region against the pattern.
+ *
+ * <p> If the match succeeds then more information can be obtained via the
+ * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods. </p>
+ *
+ * @return <tt>true</tt> if, and only if, the entire region sequence
+ * matches this matcher's pattern
+ */
+ public boolean matches() {
+ return match(from, ENDANCHOR);
+ }
+
+ /**
+ * Attempts to find the next subsequence of the input sequence that matches
+ * the pattern.
+ *
+ * <p> This method starts at the beginning of this matcher's region, or, if
+ * a previous invocation of the method was successful and the matcher has
+ * not since been reset, at the first character not matched by the previous
+ * match.
+ *
+ * <p> If the match succeeds then more information can be obtained via the
+ * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods. </p>
+ *
+ * @return <tt>true</tt> if, and only if, a subsequence of the input
+ * sequence matches this matcher's pattern
+ */
+ public boolean find() {
+ int nextSearchIndex = last;
+ if (nextSearchIndex == first)
+ nextSearchIndex++;
+
+ // If next search starts before region, start it at region
+ if (nextSearchIndex < from)
+ nextSearchIndex = from;
+
+ // If next search starts beyond region then it fails
+ if (nextSearchIndex > to) {
+ for (int i = 0; i < groups.length; i++)
+ groups[i] = -1;
+ return false;
+ }
+ return search(nextSearchIndex);
+ }
+
+ /**
+ * Resets this matcher and then attempts to find the next subsequence of
+ * the input sequence that matches the pattern, starting at the specified
+ * index.
+ *
+ * <p> If the match succeeds then more information can be obtained via the
+ * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods, and subsequent
+ * invocations of the {@link #find()} method will start at the first
+ * character not matched by this match. </p>
+ *
+ * @param start the index to start searching for a match
+ * @throws IndexOutOfBoundsException
+ * If start is less than zero or if start is greater than the
+ * length of the input sequence.
+ *
+ * @return <tt>true</tt> if, and only if, a subsequence of the input
+ * sequence starting at the given index matches this matcher's
+ * pattern
+ */
+ public boolean find(int start) {
+ int limit = getTextLength();
+ if ((start < 0) || (start > limit))
+ throw new IndexOutOfBoundsException("Illegal start index");
+ reset();
+ return search(start);
+ }
+
+ /**
+ * Attempts to match the input sequence, starting at the beginning of the
+ * region, against the pattern.
+ *
+ * <p> Like the {@link #matches matches} method, this method always starts
+ * at the beginning of the region; unlike that method, it does not
+ * require that the entire region be matched.
+ *
+ * <p> If the match succeeds then more information can be obtained via the
+ * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods. </p>
+ *
+ * @return <tt>true</tt> if, and only if, a prefix of the input
+ * sequence matches this matcher's pattern
+ */
+ public boolean lookingAt() {
+ return match(from, NOANCHOR);
+ }
+
+ /**
+ * Returns a literal replacement <code>String</code> for the specified
+ * <code>String</code>.
+ *
+ * This method produces a <code>String</code> that will work
+ * as a literal replacement <code>s</code> in the
+ * <code>appendReplacement</code> method of the {@link Matcher} class.
+ * The <code>String</code> produced will match the sequence of characters
+ * in <code>s</code> treated as a literal sequence. Slashes ('\') and
+ * dollar signs ('$') will be given no special meaning.
+ *
+ * @param s The string to be literalized
+ * @return A literal string replacement
+ * @since 1.5
+ */
+ public static String quoteReplacement(String s) {
+ if ((s.indexOf('\\') == -1) && (s.indexOf('$') == -1))
+ return s;
+ StringBuilder sb = new StringBuilder();
+ for (int i=0; i<s.length(); i++) {
+ char c = s.charAt(i);
+ if (c == '\\' || c == '$') {
+ sb.append('\\');
+ }
+ sb.append(c);
+ }
+ return sb.toString();
+ }
+
+ /**
+ * Implements a non-terminal append-and-replace step.
+ *
+ * <p> This method performs the following actions: </p>
+ *
+ * <ol>
+ *
+ * <li><p> It reads characters from the input sequence, starting at the
+ * append position, and appends them to the given string buffer. It
+ * stops after reading the last character preceding the previous match,
+ * that is, the character at index {@link
+ * #start()} <tt>-</tt> <tt>1</tt>. </p></li>
+ *
+ * <li><p> It appends the given replacement string to the string buffer.
+ * </p></li>
+ *
+ * <li><p> It sets the append position of this matcher to the index of
+ * the last character matched, plus one, that is, to {@link #end()}.
+ * </p></li>
+ *
+ * </ol>
+ *
+ * <p> The replacement string may contain references to subsequences
+ * captured during the previous match: Each occurrence of
+ * <tt>${</tt><i>name</i><tt>}</tt> or <tt>$</tt><i>g</i>
+ * will be replaced by the result of evaluating the corresponding
+ * {@link #group(String) group(name)} or {@link #group(int) group(g)}
+ * respectively. For <tt>$</tt><i>g</i>,
+ * the first number after the <tt>$</tt> is always treated as part of
+ * the group reference. Subsequent numbers are incorporated into g if
+ * they would form a legal group reference. Only the numerals '0'
+ * through '9' are considered as potential components of the group
+ * reference. If the second group matched the string <tt>"foo"</tt>, for
+ * example, then passing the replacement string <tt>"$2bar"</tt> would
+ * cause <tt>"foobar"</tt> to be appended to the string buffer. A dollar
+ * sign (<tt>$</tt>) may be included as a literal in the replacement
+ * string by preceding it with a backslash (<tt>\$</tt>).
+ *
+ * <p> Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in
+ * the replacement string may cause the results to be different than if it
+ * were being treated as a literal replacement string. Dollar signs may be
+ * treated as references to captured subsequences as described above, and
+ * backslashes are used to escape literal characters in the replacement
+ * string.
+ *
+ * <p> This method is intended to be used in a loop together with the
+ * {@link #appendTail appendTail} and {@link #find find} methods. The
+ * following code, for example, writes <tt>one dog two dogs in the
+ * yard</tt> to the standard-output stream: </p>
+ *
+ * <blockquote><pre>
+ * Pattern p = Pattern.compile("cat");
+ * Matcher m = p.matcher("one cat two cats in the yard");
+ * StringBuffer sb = new StringBuffer();
+ * while (m.find()) {
+ * m.appendReplacement(sb, "dog");
+ * }
+ * m.appendTail(sb);
+ * System.out.println(sb.toString());</pre></blockquote>
+ *
+ * @param sb
+ * The target string buffer
+ *
+ * @param replacement
+ * The replacement string
+ *
+ * @return This matcher
+ *
+ * @throws IllegalStateException
+ * If no match has yet been attempted,
+ * or if the previous match operation failed
+ *
+ * @throws IllegalArgumentException
+ * If the replacement string refers to a named-capturing
+ * group that does not exist in the pattern
+ *
+ * @throws IndexOutOfBoundsException
+ * If the replacement string refers to a capturing group
+ * that does not exist in the pattern
+ */
+ public Matcher appendReplacement(StringBuffer sb, String replacement) {
+
+ // If no match, return error
+ if (first < 0)
+ throw new IllegalStateException("No match available");
+
+ // Process substitution string to replace group references with groups
+ int cursor = 0;
+ StringBuilder result = new StringBuilder();
+
+ while (cursor < replacement.length()) {
+ char nextChar = replacement.charAt(cursor);
+ if (nextChar == '\\') {
+ cursor++;
+ if (cursor == replacement.length())
+ throw new IllegalArgumentException(
+ "character to be escaped is missing");
+ nextChar = replacement.charAt(cursor);
+ result.append(nextChar);
+ cursor++;
+ } else if (nextChar == '$') {
+ // Skip past $
+ cursor++;
+ // Throw IAE if this "$" is the last character in replacement
+ if (cursor == replacement.length())
+ throw new IllegalArgumentException(
+ "Illegal group reference: group index is missing");
+ nextChar = replacement.charAt(cursor);
+ int refNum = -1;
+ if (nextChar == '{') {
+ cursor++;
+ StringBuilder gsb = new StringBuilder();
+ while (cursor < replacement.length()) {
+ nextChar = replacement.charAt(cursor);
+ if (ASCII.isLower(nextChar) ||
+ ASCII.isUpper(nextChar) ||
+ ASCII.isDigit(nextChar)) {
+ gsb.append(nextChar);
+ cursor++;
+ } else {
+ break;
+ }
+ }
+ if (gsb.length() == 0)
+ throw new IllegalArgumentException(
+ "named capturing group has 0 length name");
+ if (nextChar != '}')
+ throw new IllegalArgumentException(
+ "named capturing group is missing trailing '}'");
+ String gname = gsb.toString();
+ if (ASCII.isDigit(gname.charAt(0)))
+ throw new IllegalArgumentException(
+ "capturing group name {" + gname +
+ "} starts with digit character");
+ if (!parentPattern.namedGroups().containsKey(gname))
+ throw new IllegalArgumentException(
+ "No group with name {" + gname + "}");
+ refNum = parentPattern.namedGroups().get(gname);
+ cursor++;
+ } else {
+ // The first number is always a group
+ refNum = (int)nextChar - '0';
+ if ((refNum < 0)||(refNum > 9))
+ throw new IllegalArgumentException(
+ "Illegal group reference");
+ cursor++;
+ // Capture the largest legal group string
+ boolean done = false;
+ while (!done) {
+ if (cursor >= replacement.length()) {
+ break;
+ }
+ int nextDigit = replacement.charAt(cursor) - '0';
+ if ((nextDigit < 0)||(nextDigit > 9)) { // not a number
+ break;
+ }
+ int newRefNum = (refNum * 10) + nextDigit;
+ if (groupCount() < newRefNum) {
+ done = true;
+ } else {
+ refNum = newRefNum;
+ cursor++;
+ }
+ }
+ }
+ // Append group
+ if (start(refNum) != -1 && end(refNum) != -1)
+ result.append(text, start(refNum), end(refNum));
+ } else {
+ result.append(nextChar);
+ cursor++;
+ }
+ }
+ // Append the intervening text
+ sb.append(text, lastAppendPosition, first);
+ // Append the match substitution
+ sb.append(result);
+
+ lastAppendPosition = last;
+ return this;
+ }
+
+ /**
+ * Implements a terminal append-and-replace step.
+ *
+ * <p> This method reads characters from the input sequence, starting at
+ * the append position, and appends them to the given string buffer. It is
+ * intended to be invoked after one or more invocations of the {@link
+ * #appendReplacement appendReplacement} method in order to copy the
+ * remainder of the input sequence. </p>
+ *
+ * @param sb
+ * The target string buffer
+ *
+ * @return The target string buffer
+ */
+ public StringBuffer appendTail(StringBuffer sb) {
+ sb.append(text, lastAppendPosition, getTextLength());
+ return sb;
+ }
+
+ /**
+ * Replaces every subsequence of the input sequence that matches the
+ * pattern with the given replacement string.
+ *
+ * <p> This method first resets this matcher. It then scans the input
+ * sequence looking for matches of the pattern. Characters that are not
+ * part of any match are appended directly to the result string; each match
+ * is replaced in the result by the replacement string. The replacement
+ * string may contain references to captured subsequences as in the {@link
+ * #appendReplacement appendReplacement} method.
+ *
+ * <p> Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in
+ * the replacement string may cause the results to be different than if it
+ * were being treated as a literal replacement string. Dollar signs may be
+ * treated as references to captured subsequences as described above, and
+ * backslashes are used to escape literal characters in the replacement
+ * string.
+ *
+ * <p> Given the regular expression <tt>a*b</tt>, the input
+ * <tt>"aabfooaabfooabfoob"</tt>, and the replacement string
+ * <tt>"-"</tt>, an invocation of this method on a matcher for that
+ * expression would yield the string <tt>"-foo-foo-foo-"</tt>.
+ *
+ * <p> Invoking this method changes this matcher's state. If the matcher
+ * is to be used in further matching operations then it should first be
+ * reset. </p>
+ *
+ * @param replacement
+ * The replacement string
+ *
+ * @return The string constructed by replacing each matching subsequence
+ * by the replacement string, substituting captured subsequences
+ * as needed
+ */
+ public String replaceAll(String replacement) {
+ reset();
+ boolean result = find();
+ if (result) {
+ StringBuffer sb = new StringBuffer();
+ do {
+ appendReplacement(sb, replacement);
+ result = find();
+ } while (result);
+ appendTail(sb);
+ return sb.toString();
+ }
+ return text.toString();
+ }
+
+ /**
+ * Replaces the first subsequence of the input sequence that matches the
+ * pattern with the given replacement string.
+ *
+ * <p> This method first resets this matcher. It then scans the input
+ * sequence looking for a match of the pattern. Characters that are not
+ * part of the match are appended directly to the result string; the match
+ * is replaced in the result by the replacement string. The replacement
+ * string may contain references to captured subsequences as in the {@link
+ * #appendReplacement appendReplacement} method.
+ *
+ * <p>Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in
+ * the replacement string may cause the results to be different than if it
+ * were being treated as a literal replacement string. Dollar signs may be
+ * treated as references to captured subsequences as described above, and
+ * backslashes are used to escape literal characters in the replacement
+ * string.
+ *
+ * <p> Given the regular expression <tt>dog</tt>, the input
+ * <tt>"zzzdogzzzdogzzz"</tt>, and the replacement string
+ * <tt>"cat"</tt>, an invocation of this method on a matcher for that
+ * expression would yield the string <tt>"zzzcatzzzdogzzz"</tt>. </p>
+ *
+ * <p> Invoking this method changes this matcher's state. If the matcher
+ * is to be used in further matching operations then it should first be
+ * reset. </p>
+ *
+ * @param replacement
+ * The replacement string
+ * @return The string constructed by replacing the first matching
+ * subsequence by the replacement string, substituting captured
+ * subsequences as needed
+ */
+ public String replaceFirst(String replacement) {
+ if (replacement == null)
+ throw new NullPointerException("replacement");
+ reset();
+ if (!find())
+ return text.toString();
+ StringBuffer sb = new StringBuffer();
+ appendReplacement(sb, replacement);
+ appendTail(sb);
+ return sb.toString();
+ }
+
+ /**
+ * Sets the limits of this matcher's region. The region is the part of the
+ * input sequence that will be searched to find a match. Invoking this
+ * method resets the matcher, and then sets the region to start at the
+ * index specified by the <code>start</code> parameter and end at the
+ * index specified by the <code>end</code> parameter.
+ *
+ * <p>Depending on the transparency and anchoring being used (see
+ * {@link #useTransparentBounds useTransparentBounds} and
+ * {@link #useAnchoringBounds useAnchoringBounds}), certain constructs such
+ * as anchors may behave differently at or around the boundaries of the
+ * region.
+ *
+ * @param start
+ * The index to start searching at (inclusive)
+ * @param end
+ * The index to end searching at (exclusive)
+ * @throws IndexOutOfBoundsException
+ * If start or end is less than zero, if
+ * start is greater than the length of the input sequence, if
+ * end is greater than the length of the input sequence, or if
+ * start is greater than end.
+ * @return this matcher
+ * @since 1.5
+ */
+ public Matcher region(int start, int end) {
+ if ((start < 0) || (start > getTextLength()))
+ throw new IndexOutOfBoundsException("start");
+ if ((end < 0) || (end > getTextLength()))
+ throw new IndexOutOfBoundsException("end");
+ if (start > end)
+ throw new IndexOutOfBoundsException("start > end");
+ reset();
+ from = start;
+ to = end;
+ return this;
+ }
+
+ /**
+ * Reports the start index of this matcher's region. The
+ * searches this matcher conducts are limited to finding matches
+ * within {@link #regionStart regionStart} (inclusive) and
+ * {@link #regionEnd regionEnd} (exclusive).
+ *
+ * @return The starting point of this matcher's region
+ * @since 1.5
+ */
+ public int regionStart() {
+ return from;
+ }
+
+ /**
+ * Reports the end index (exclusive) of this matcher's region.
+ * The searches this matcher conducts are limited to finding matches
+ * within {@link #regionStart regionStart} (inclusive) and
+ * {@link #regionEnd regionEnd} (exclusive).
+ *
+ * @return the ending point of this matcher's region
+ * @since 1.5
+ */
+ public int regionEnd() {
+ return to;
+ }
+
+ /**
+ * Queries the transparency of region bounds for this matcher.
+ *
+ * <p> This method returns <tt>true</tt> if this matcher uses
+ * <i>transparent</i> bounds, <tt>false</tt> if it uses <i>opaque</i>
+ * bounds.
+ *
+ * <p> See {@link #useTransparentBounds useTransparentBounds} for a
+ * description of transparent and opaque bounds.
+ *
+ * <p> By default, a matcher uses opaque region boundaries.
+ *
+ * @return <tt>true</tt> iff this matcher is using transparent bounds,
+ * <tt>false</tt> otherwise.
+ * @see java.util.regex.Matcher#useTransparentBounds(boolean)
+ * @since 1.5
+ */
+ public boolean hasTransparentBounds() {
+ return transparentBounds;
+ }
+
+ /**
+ * Sets the transparency of region bounds for this matcher.
+ *
+ * <p> Invoking this method with an argument of <tt>true</tt> will set this
+ * matcher to use <i>transparent</i> bounds. If the boolean
+ * argument is <tt>false</tt>, then <i>opaque</i> bounds will be used.
+ *
+ * <p> Using transparent bounds, the boundaries of this
+ * matcher's region are transparent to lookahead, lookbehind,
+ * and boundary matching constructs. Those constructs can see beyond the
+ * boundaries of the region to see if a match is appropriate.
+ *
+ * <p> Using opaque bounds, the boundaries of this matcher's
+ * region are opaque to lookahead, lookbehind, and boundary matching
+ * constructs that may try to see beyond them. Those constructs cannot
+ * look past the boundaries so they will fail to match anything outside
+ * of the region.
+ *
+ * <p> By default, a matcher uses opaque bounds.
+ *
+ * @param b a boolean indicating whether to use opaque or transparent
+ * regions
+ * @return this matcher
+ * @see java.util.regex.Matcher#hasTransparentBounds
+ * @since 1.5
+ */
+ public Matcher useTransparentBounds(boolean b) {
+ transparentBounds = b;
+ return this;
+ }
+
+ /**
+ * Queries the anchoring of region bounds for this matcher.
+ *
+ * <p> This method returns <tt>true</tt> if this matcher uses
+ * <i>anchoring</i> bounds, <tt>false</tt> otherwise.
+ *
+ * <p> See {@link #useAnchoringBounds useAnchoringBounds} for a
+ * description of anchoring bounds.
+ *
+ * <p> By default, a matcher uses anchoring region boundaries.
+ *
+ * @return <tt>true</tt> iff this matcher is using anchoring bounds,
+ * <tt>false</tt> otherwise.
+ * @see java.util.regex.Matcher#useAnchoringBounds(boolean)
+ * @since 1.5
+ */
+ public boolean hasAnchoringBounds() {
+ return anchoringBounds;
+ }
+
+ /**
+ * Sets the anchoring of region bounds for this matcher.
+ *
+ * <p> Invoking this method with an argument of <tt>true</tt> will set this
+ * matcher to use <i>anchoring</i> bounds. If the boolean
+ * argument is <tt>false</tt>, then <i>non-anchoring</i> bounds will be
+ * used.
+ *
+ * <p> Using anchoring bounds, the boundaries of this
+ * matcher's region match anchors such as ^ and $.
+ *
+ * <p> Without anchoring bounds, the boundaries of this
+ * matcher's region will not match anchors such as ^ and $.
+ *
+ * <p> By default, a matcher uses anchoring region boundaries.
+ *
+ * @param b a boolean indicating whether or not to use anchoring bounds.
+ * @return this matcher
+ * @see java.util.regex.Matcher#hasAnchoringBounds
+ * @since 1.5
+ */
+ public Matcher useAnchoringBounds(boolean b) {
+ anchoringBounds = b;
+ return this;
+ }
+
+ /**
+ * <p>Returns the string representation of this matcher. The
+ * string representation of a <code>Matcher</code> contains information
+ * that may be useful for debugging. The exact format is unspecified.
+ *
+ * @return The string representation of this matcher
+ * @since 1.5
+ */
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append("org.eclipse.wst.xml.xpath2.regex.Matcher");
+ sb.append("[pattern=" + pattern());
+ sb.append(" region=");
+ sb.append(regionStart() + "," + regionEnd());
+ sb.append(" lastmatch=");
+ if ((first >= 0) && (group() != null)) {
+ sb.append(group());
+ }
+ sb.append("]");
+ return sb.toString();
+ }
+
+ /**
+ * <p>Returns true if the end of input was hit by the search engine in
+ * the last match operation performed by this matcher.
+ *
+ * <p>When this method returns true, then it is possible that more input
+ * would have changed the result of the last search.
+ *
+ * @return true iff the end of input was hit in the last match; false
+ * otherwise
+ * @since 1.5
+ */
+ public boolean hitEnd() {
+ return hitEnd;
+ }
+
+ /**
+ * <p>Returns true if more input could change a positive match into a
+ * negative one.
+ *
+ * <p>If this method returns true, and a match was found, then more
+ * input could cause the match to be lost. If this method returns false
+ * and a match was found, then more input might change the match but the
+ * match won't be lost. If a match was not found, then requireEnd has no
+ * meaning.
+ *
+ * @return true iff more input could change a positive match into a
+ * negative one.
+ * @since 1.5
+ */
+ public boolean requireEnd() {
+ return requireEnd;
+ }
+
+ /**
+ * Initiates a search to find a Pattern within the given bounds.
+ * The groups are filled with default values and the match of the root
+ * of the state machine is called. The state machine will hold the state
+ * of the match as it proceeds in this matcher.
+ *
+ * Matcher.from is not set here, because it is the "hard" boundary
+ * of the start of the search which anchors will set to. The from param
+ * is the "soft" boundary of the start of the search, meaning that the
+ * regex tries to match at that index but ^ won't match there. Subsequent
+ * calls to the search methods start at a new "soft" boundary which is
+ * the end of the previous match.
+ */
+ boolean search(int from) {
+ this.hitEnd = false;
+ this.requireEnd = false;
+ from = from < 0 ? 0 : from;
+ this.first = from;
+ this.oldLast = oldLast < 0 ? from : oldLast;
+ for (int i = 0; i < groups.length; i++)
+ groups[i] = -1;
+ acceptMode = NOANCHOR;
+ boolean result = parentPattern.root.match(this, from, text);
+ if (!result)
+ this.first = -1;
+ this.oldLast = this.last;
+ return result;
+ }
+
+ /**
+ * Initiates a search for an anchored match to a Pattern within the given
+ * bounds. The groups are filled with default values and the match of the
+ * root of the state machine is called. The state machine will hold the
+ * state of the match as it proceeds in this matcher.
+ */
+ boolean match(int from, int anchor) {
+ this.hitEnd = false;
+ this.requireEnd = false;
+ from = from < 0 ? 0 : from;
+ this.first = from;
+ this.oldLast = oldLast < 0 ? from : oldLast;
+ for (int i = 0; i < groups.length; i++)
+ groups[i] = -1;
+ acceptMode = anchor;
+ boolean result = parentPattern.matchRoot.match(this, from, text);
+ if (!result)
+ this.first = -1;
+ this.oldLast = this.last;
+ return result;
+ }
+
+ /**
+ * Returns the end index of the text.
+ *
+ * @return the index after the last character in the text
+ */
+ int getTextLength() {
+ return text.length();
+ }
+
+ /**
+ * Generates a String from this Matcher's input in the specified range.
+ *
+ * @param beginIndex the beginning index, inclusive
+ * @param endIndex the ending index, exclusive
+ * @return A String generated from this Matcher's input
+ */
+ CharSequence getSubSequence(int beginIndex, int endIndex) {
+ return text.subSequence(beginIndex, endIndex);
+ }
+
+ /**
+ * Returns this Matcher's input character at index i.
+ *
+ * @return A char from the specified index
+ */
+ char charAt(int i) {
+ return text.charAt(i);
+ }
+
+ /**
+ * Returns the group index of the matched capturing group.
+ *
+ * @return the index of the named-capturing group
+ */
+ int getMatchedGroupIndex(String name) {
+ Objects.requireNonNull(name, "Group name");
+ if (first < 0)
+ throw new IllegalStateException("No match found");
+ if (!parentPattern.namedGroups().containsKey(name))
+ throw new IllegalArgumentException("No group with name <" + name + ">");
+ return parentPattern.namedGroups().get(name);
+ }
+}
diff --git a/src/org/apache/xpath/regex/Pattern.java b/src/org/apache/xpath/regex/Pattern.java
new file mode 100644
index 00000000..7aad934a
--- /dev/null
+++ b/src/org/apache/xpath/regex/Pattern.java
@@ -0,0 +1,5250 @@
+/*
+ * Copyright (c) 1999, 2019, Oracle and/or its affiliates. All rights reserved.
+ * ORACLE PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ */
+
+package org.apache.xpath.regex;
+
+import java.text.Normalizer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.xerces.util.XML11Char;
+
+/**
+ * A compiled representation of a regular expression.
+ *
+ * @author Mike McCloskey
+ * @author Mark Reinhold
+ * @author JSR-51 Expert Group
+ * @since 1.4
+ * @spec JSR-51
+ */
+/*
+ * @modification notes: Mukul Gandhi (https://xalan.apache.org/xalan-j/index.html).
+ *
+ * Minor modifications to this class provided by JDK 1.8, were done to make it compliant
+ * to XPath 3.1 regex syntax (https://www.w3.org/TR/xpath-functions-31/#regex-syntax).
+ *
+ * The following, XPath 3.1 related modifications to this class were done:
+ *
+ * 1) The COMMENTS flag was renamed to IGNORE_WHITESPACE. The behavior of this
+ * flag was changed, to not recognize regex inline comments starting with
+ * character # when XPath 3.1 regex flag "x" is provided. When XPath 3.1 regex
+ * flag "x" is provided, only whitespaces within regex are ignored, for an
+ * XPath 3.1 processor based on this class.
+ *
+ * 2) Java allows, escape characters 'x' and 'u' within regex to denote unicode code
+ * points. But XPath 3.1 / XSD regex syntax, doesn't allow these escape characters.
+ * Modification to this class, was done to not allow escape characters 'x' and 'u'
+ * within regex. To specify, unicode code points within XPath 3.1 / XSD regex's,
+ * the syntax like   can be used.
+ *
+ * Few other, escape characters allowed by Java regex syntax, are disallowed by
+ * XPath 3.1 / XSD regex syntax. Modification to this class, was done not to
+ * allow those other escape characters as well.
+ *
+ * Modification to this class were done, to comply with XPath 3.1 / XSD regex
+ * definitions of following multi-character escapes : ., \s, \S.
+ *
+ * Implemented the XPath 3.1 / XSD regex multi-character escape sequences
+ * \i, \I, \c, \C.
+ *
+ * 3) Modification to this class were done, to support XPath 3.1 / XSD
+ * unicode block escape expressions, like \p{IsBasicLatin}, \p{IsLatin-1Supplement} etc.
+ *
+ * Other than above mentioned modifications, this class behaves exactly like the
+ * Java class : java.util.regex.Pattern.
+ *
+ * This class is intended, to be used only for an XPath 3.1 implementation. For any other
+ * Java application requiring regex's, the class java.util.regex.Pattern provided by JDK
+ * must be used.
+ */
+public final class Pattern
+ implements java.io.Serializable
+{
+
+ /**
+ * Regular expression modifier values. Instead of being passed as
+ * arguments, they can also be passed as inline modifiers.
+ * For example, the following statements have the same effect.
+ * <pre>
+ * RegExp r1 = RegExp.compile("abc", Pattern.I|Pattern.M);
+ * RegExp r2 = RegExp.compile("(?im)abc", 0);
+ * </pre>
+ *
+ * The flags are duplicated so that the familiar Perl match flag
+ * names are available.
+ */
+
+ /**
+ * Enables Unix lines mode.
+ *
+ * <p> In this mode, only the <tt>'\n'</tt> line terminator is recognized
+ * in the behavior of <tt>.</tt>, <tt>^</tt>, and <tt>$</tt>.
+ *
+ * <p> Unix lines mode can also be enabled via the embedded flag
+ * expression <tt>(?d)</tt>.
+ */
+ public static final int UNIX_LINES = 0x01;
+
+ /**
+ * Enables case-insensitive matching.
+ *
+ * <p> By default, case-insensitive matching assumes that only characters
+ * in the US-ASCII charset are being matched. Unicode-aware
+ * case-insensitive matching can be enabled by specifying the {@link
+ * #UNICODE_CASE} flag in conjunction with this flag.
+ *
+ * <p> Case-insensitive matching can also be enabled via the embedded flag
+ * expression <tt>(?i)</tt>.
+ *
+ * <p> Specifying this flag may impose a slight performance penalty. </p>
+ */
+ public static final int CASE_INSENSITIVE = 0x02;
+
+ /**
+ * Permits whitespaces in pattern.
+ *
+ */
+ public static final int IGNORE_WHITESPACE = 0x04;
+
+ /**
+ * Enables multiline mode.
+ *
+ * <p> In multiline mode the expressions <tt>^</tt> and <tt>$</tt> match
+ * just after or just before, respectively, a line terminator or the end of
+ * the input sequence. By default these expressions only match at the
+ * beginning and the end of the entire input sequence.
+ *
+ * <p> Multiline mode can also be enabled via the embedded flag
+ * expression <tt>(?m)</tt>. </p>
+ */
+ public static final int MULTILINE = 0x08;
+
+ /**
+ * Enables literal parsing of the pattern.
+ *
+ * <p> When this flag is specified then the input string that specifies
+ * the pattern is treated as a sequence of literal characters.
+ * Metacharacters or escape sequences in the input sequence will be
+ * given no special meaning.
+ *
+ * <p>The flags CASE_INSENSITIVE and UNICODE_CASE retain their impact on
+ * matching when used in conjunction with this flag. The other flags
+ * become superfluous.
+ *
+ * <p> There is no embedded flag character for enabling literal parsing.
+ * @since 1.5
+ */
+ public static final int LITERAL = 0x10;
+
+ /**
+ * Enables dotall mode.
+ *
+ * <p> In dotall mode, the expression <tt>.</tt> matches any character,
+ * including a line terminator. By default this expression does not match
+ * line terminators.
+ *
+ * <p> Dotall mode can also be enabled via the embedded flag
+ * expression <tt>(?s)</tt>. (The <tt>s</tt> is a mnemonic for
+ * "single-line" mode, which is what this is called in Perl.) </p>
+ */
+ public static final int DOTALL = 0x20;
+
+ /**
+ * Enables Unicode-aware case folding.
+ *
+ * <p> When this flag is specified then case-insensitive matching, when
+ * enabled by the {@link #CASE_INSENSITIVE} flag, is done in a manner
+ * consistent with the Unicode Standard. By default, case-insensitive
+ * matching assumes that only characters in the US-ASCII charset are being
+ * matched.
+ *
+ * <p> Unicode-aware case folding can also be enabled via the embedded flag
+ * expression <tt>(?u)</tt>.
+ *
+ * <p> Specifying this flag may impose a performance penalty. </p>
+ */
+ public static final int UNICODE_CASE = 0x40;
+
+ /**
+ * Enables canonical equivalence.
+ *
+ * <p> When this flag is specified then two characters will be considered
+ * to match if, and only if, their full canonical decompositions match.
+ * The expression <tt>"a\u030A"</tt>, for example, will match the
+ * string <tt>"\u00E5"</tt> when this flag is specified. By default,
+ * matching does not take canonical equivalence into account.
+ *
+ * <p> There is no embedded flag character for enabling canonical
+ * equivalence.
+ *
+ * <p> Specifying this flag may impose a performance penalty. </p>
+ */
+ public static final int CANON_EQ = 0x80;
+
+ /**
+ * Enables the Unicode version of <i>Predefined character classes</i> and
+ * <i>POSIX character classes</i>.
+ *
+ * <p> When this flag is specified then the (US-ASCII only)
+ * <i>Predefined character classes</i> and <i>POSIX character classes</i>
+ * are in conformance with
+ * <a href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
+ * Standard #18: Unicode Regular Expression</i></a>
+ * <i>Annex C: Compatibility Properties</i>.
+ * <p>
+ * The UNICODE_CHARACTER_CLASS mode can also be enabled via the embedded
+ * flag expression <tt>(?U)</tt>.
+ * <p>
+ * The flag implies UNICODE_CASE, that is, it enables Unicode-aware case
+ * folding.
+ * <p>
+ * Specifying this flag may impose a performance penalty. </p>
+ * @since 1.7
+ */
+ public static final int UNICODE_CHARACTER_CLASS = 0x100;
+
+ /* Pattern has only two serialized components: The pattern string
+ * and the flags, which are all that is needed to recompile the pattern
+ * when it is deserialized.
+ */
+
+ /** use serialVersionUID from Merlin b59 for interoperability */
+ private static final long serialVersionUID = 5073258162644648461L;
+
+ /**
+ * The original regular-expression pattern string.
+ *
+ * @serial
+ */
+ private String pattern;
+
+ /**
+ * The original pattern flags.
+ *
+ * @serial
+ */
+ private int flags;
+
+ /**
+ * Boolean indicating this Pattern is compiled; this is necessary in order
+ * to lazily compile deserialized Patterns.
+ */
+ private transient volatile boolean compiled = false;
+
+ /**
+ * The normalized pattern string.
+ */
+ private transient String normalizedPattern;
+
+ /**
+ * The starting point of state machine for the find operation. This allows
+ * a match to start anywhere in the input.
+ */
+ transient Node root;
+
+ /**
+ * The root of object tree for a match operation. The pattern is matched
+ * at the beginning. This may include a find that uses BnM or a First
+ * node.
+ */
+ transient Node matchRoot;
+
+ /**
+ * Temporary storage used by parsing pattern slice.
+ */
+ transient int[] buffer;
+
+ /**
+ * Map the "name" of the "named capturing group" to its group id
+ * node.
+ */
+ transient volatile Map<String, Integer> namedGroups;
+
+ /**
+ * Temporary storage used while parsing group references.
+ */
+ transient GroupHead[] groupNodes;
+
+ /**
+ * Temporary null terminated code point array used by pattern compiling.
+ */
+ private transient int[] temp;
+
+ /**
+ * The number of capturing groups in this Pattern. Used by matchers to
+ * allocate storage needed to perform a match.
+ */
+ transient int capturingGroupCount;
+
+ /**
+ * The local variable count used by parsing tree. Used by matchers to
+ * allocate storage needed to perform a match.
+ */
+ transient int localCount;
+
+ /**
+ * Index into the pattern string that keeps track of how much has been
+ * parsed.
+ */
+ private transient int cursor;
+
+ /**
+ * Holds the length of the pattern string.
+ */
+ private transient int patternLength;
+
+ /**
+ * If the Start node might possibly match supplementary characters.
+ * It is set to true during compiling if
+ * (1) There is supplementary char in pattern, or
+ * (2) There is complement node of Category or Block
+ */
+ private transient boolean hasSupplementary;
+
+ /**
+ * Compiles the given regular expression into a pattern.
+ *
+ * @param regex
+ * The expression to be compiled
+ * @return the given regular expression compiled into a pattern
+ * @throws PatternSyntaxException
+ * If the expression's syntax is invalid
+ */
+ public static Pattern compile(String regex) {
+ return new Pattern(regex, 0);
+ }
+
+ /**
+ * Compiles the given regular expression into a pattern with the given
+ * flags.
+ *
+ * @param regex
+ * The expression to be compiled
+ *
+ * @param flags
+ * Match flags, a bit mask that may include
+ * {@link #CASE_INSENSITIVE}, {@link #MULTILINE}, {@link #DOTALL},
+ * {@link #UNICODE_CASE}, {@link #CANON_EQ}, {@link #UNIX_LINES},
+ * {@link #LITERAL}, {@link #UNICODE_CHARACTER_CLASS}
+ * and {@link #IGNORE_WHITESPACE}
+ *
+ * @return the given regular expression compiled into a pattern with the given flags
+ * @throws IllegalArgumentException
+ * If bit values other than those corresponding to the defined
+ * match flags are set in <tt>flags</tt>
+ *
+ * @throws PatternSyntaxException
+ * If the expression's syntax is invalid
+ */
+ public static Pattern compile(String regex, int flags) {
+ return new Pattern(regex, flags);
+ }
+
+ /**
+ * Returns the regular expression from which this pattern was compiled.
+ *
+ * @return The source of this pattern
+ */
+ public String pattern() {
+ return pattern;
+ }
+
+ /**
+ * <p>Returns the string representation of this pattern. This
+ * is the regular expression from which this pattern was
+ * compiled.</p>
+ *
+ * @return The string representation of this pattern
+ * @since 1.5
+ */
+ public String toString() {
+ return pattern;
+ }
+
+ /**
+ * Creates a matcher that will match the given input against this pattern.
+ *
+ * @param input
+ * The character sequence to be matched
+ *
+ * @return A new matcher for this pattern
+ */
+ public Matcher matcher(CharSequence input) {
+ if (!compiled) {
+ synchronized(this) {
+ if (!compiled)
+ compile();
+ }
+ }
+ Matcher m = new Matcher(this, input);
+ return m;
+ }
+
+ /**
+ * Returns this pattern's match flags.
+ *
+ * @return The match flags specified when this pattern was compiled
+ */
+ public int flags() {
+ return flags;
+ }
+
+ /**
+ * Compiles the given regular expression and attempts to match the given
+ * input against it.
+ *
+ * <p> An invocation of this convenience method of the form
+ *
+ * <blockquote><pre>
+ * Pattern.matches(regex, input);</pre></blockquote>
+ *
+ * behaves in exactly the same way as the expression
+ *
+ * <blockquote><pre>
+ * Pattern.compile(regex).matcher(input).matches()</pre></blockquote>
+ *
+ * <p> If a pattern is to be used multiple times, compiling it once and reusing
+ * it will be more efficient than invoking this method each time. </p>
+ *
+ * @param regex
+ * The expression to be compiled
+ *
+ * @param input
+ * The character sequence to be matched
+ * @return whether or not the regular expression matches on the input
+ * @throws PatternSyntaxException
+ * If the expression's syntax is invalid
+ */
+ public static boolean matches(String regex, CharSequence input) {
+ Pattern p = Pattern.compile(regex);
+ Matcher m = p.matcher(input);
+ return m.matches();
+ }
+
+ /**
+ * Splits the given input sequence around matches of this pattern.
+ *
+ * <p> The array returned by this method contains each substring of the
+ * input sequence that is terminated by another subsequence that matches
+ * this pattern or is terminated by the end of the input sequence. The
+ * substrings in the array are in the order in which they occur in the
+ * input. If this pattern does not match any subsequence of the input then
+ * the resulting array has just one element, namely the input sequence in
+ * string form.
+ *
+ * <p> When there is a positive-width match at the beginning of the input
+ * sequence then an empty leading substring is included at the beginning
+ * of the resulting array. A zero-width match at the beginning however
+ * never produces such empty leading substring.
+ *
+ * <p> The <tt>limit</tt> parameter controls the number of times the
+ * pattern is applied and therefore affects the length of the resulting
+ * array. If the limit <i>n</i> is greater than zero then the pattern
+ * will be applied at most <i>n</i> - 1 times, the array's
+ * length will be no greater than <i>n</i>, and the array's last entry
+ * will contain all input beyond the last matched delimiter. If <i>n</i>
+ * is non-positive then the pattern will be applied as many times as
+ * possible and the array can have any length. If <i>n</i> is zero then
+ * the pattern will be applied as many times as possible, the array can
+ * have any length, and trailing empty strings will be discarded.
+ *
+ * <p> The input <tt>"boo:and:foo"</tt>, for example, yields the following
+ * results with these parameters:
+ *
+ * <blockquote><table cellpadding=1 cellspacing=0
+ * summary="Split examples showing regex, limit, and result">
+ * <tr><th align="left"><i>Regex </i></th>
+ * <th align="left"><i>Limit </i></th>
+ * <th align="left"><i>Result </i></th></tr>
+ * <tr><td align=center>:</td>
+ * <td align=center>2</td>
+ * <td><tt>{ "boo", "and:foo" }</tt></td></tr>
+ * <tr><td align=center>:</td>
+ * <td align=center>5</td>
+ * <td><tt>{ "boo", "and", "foo" }</tt></td></tr>
+ * <tr><td align=center>:</td>
+ * <td align=center>-2</td>
+ * <td><tt>{ "boo", "and", "foo" }</tt></td></tr>
+ * <tr><td align=center>o</td>
+ * <td align=center>5</td>
+ * <td><tt>{ "b", "", ":and:f", "", "" }</tt></td></tr>
+ * <tr><td align=center>o</td>
+ * <td align=center>-2</td>
+ * <td><tt>{ "b", "", ":and:f", "", "" }</tt></td></tr>
+ * <tr><td align=center>o</td>
+ * <td align=center>0</td>
+ * <td><tt>{ "b", "", ":and:f" }</tt></td></tr>
+ * </table></blockquote>
+ *
+ * @param input
+ * The character sequence to be split
+ *
+ * @param limit
+ * The result threshold, as described above
+ *
+ * @return The array of strings computed by splitting the input
+ * around matches of this pattern
+ */
+ public String[] split(CharSequence input, int limit) {
+ int index = 0;
+ boolean matchLimited = limit > 0;
+ ArrayList<String> matchList = new ArrayList<>();
+ Matcher m = matcher(input);
+
+ // Add segments before each match found
+ while(m.find()) {
+ if (!matchLimited || matchList.size() < limit - 1) {
+ if (index == 0 && index == m.start() && m.start() == m.end()) {
+ // no empty leading substring included for zero-width match
+ // at the beginning of the input char sequence.
+ continue;
+ }
+ String match = input.subSequence(index, m.start()).toString();
+ matchList.add(match);
+ index = m.end();
+ } else if (matchList.size() == limit - 1) { // last one
+ String match = input.subSequence(index,
+ input.length()).toString();
+ matchList.add(match);
+ index = m.end();
+ }
+ }
+
+ // If no match was found, return this
+ if (index == 0)
+ return new String[] {input.toString()};
+
+ // Add remaining segment
+ if (!matchLimited || matchList.size() < limit)
+ matchList.add(input.subSequence(index, input.length()).toString());
+
+ // Construct result
+ int resultSize = matchList.size();
+ if (limit == 0)
+ while (resultSize > 0 && matchList.get(resultSize-1).equals(""))
+ resultSize--;
+ String[] result = new String[resultSize];
+ return matchList.subList(0, resultSize).toArray(result);
+ }
+
+ /**
+ * Splits the given input sequence around matches of this pattern.
+ *
+ * <p> This method works as if by invoking the two-argument {@link
+ * #split(java.lang.CharSequence, int) split} method with the given input
+ * sequence and a limit argument of zero. Trailing empty strings are
+ * therefore not included in the resulting array. </p>
+ *
+ * <p> The input <tt>"boo:and:foo"</tt>, for example, yields the following
+ * results with these expressions:
+ *
+ * <blockquote><table cellpadding=1 cellspacing=0
+ * summary="Split examples showing regex and result">
+ * <tr><th align="left"><i>Regex </i></th>
+ * <th align="left"><i>Result</i></th></tr>
+ * <tr><td align=center>:</td>
+ * <td><tt>{ "boo", "and", "foo" }</tt></td></tr>
+ * <tr><td align=center>o</td>
+ * <td><tt>{ "b", "", ":and:f" }</tt></td></tr>
+ * </table></blockquote>
+ *
+ *
+ * @param input
+ * The character sequence to be split
+ *
+ * @return The array of strings computed by splitting the input
+ * around matches of this pattern
+ */
+ public String[] split(CharSequence input) {
+ return split(input, 0);
+ }
+
+ /**
+ * Returns a literal pattern <code>String</code> for the specified
+ * <code>String</code>.
+ *
+ * <p>This method produces a <code>String</code> that can be used to
+ * create a <code>Pattern</code> that would match the string
+ * <code>s</code> as if it were a literal pattern.</p> Metacharacters
+ * or escape sequences in the input sequence will be given no special
+ * meaning.
+ *
+ * @param s The string to be literalized
+ * @return A literal string replacement
+ * @since 1.5
+ */
+ public static String quote(String s) {
+ int slashEIndex = s.indexOf("\\E");
+ if (slashEIndex == -1)
+ return "\\Q" + s + "\\E";
+
+ StringBuilder sb = new StringBuilder(s.length() * 2);
+ sb.append("\\Q");
+ slashEIndex = 0;
+ int current = 0;
+ while ((slashEIndex = s.indexOf("\\E", current)) != -1) {
+ sb.append(s.substring(current, slashEIndex));
+ current = slashEIndex + 2;
+ sb.append("\\E\\\\E\\Q");
+ }
+ sb.append(s.substring(current, s.length()));
+ sb.append("\\E");
+ return sb.toString();
+ }
+
+ /**
+ * Recompile the Pattern instance from a stream. The original pattern
+ * string is read in and the object tree is recompiled from it.
+ */
+ private void readObject(java.io.ObjectInputStream s)
+ throws java.io.IOException, ClassNotFoundException {
+
+ // Read in all fields
+ s.defaultReadObject();
+
+ // Initialize counts
+ capturingGroupCount = 1;
+ localCount = 0;
+
+ // if length > 0, the Pattern is lazily compiled
+ compiled = false;
+ if (pattern.length() == 0) {
+ root = new Start(lastAccept);
+ matchRoot = lastAccept;
+ compiled = true;
+ }
+ }
+
+ /**
+ * This private constructor is used to create all Patterns. The pattern
+ * string and match flags are all that is needed to completely describe
+ * a Pattern. An empty pattern string results in an object tree with
+ * only a Start node and a LastNode node.
+ */
+ private Pattern(String p, int f) {
+ pattern = p;
+ flags = f;
+
+ // to use UNICODE_CASE if UNICODE_CHARACTER_CLASS present
+ if ((flags & UNICODE_CHARACTER_CLASS) != 0)
+ flags |= UNICODE_CASE;
+
+ // Reset group index count
+ capturingGroupCount = 1;
+ localCount = 0;
+
+ if (pattern.length() > 0) {
+ try {
+ compile();
+ } catch (StackOverflowError soe) {
+ throw error("Stack overflow during pattern compilation");
+ }
+ } else {
+ root = new Start(lastAccept);
+ matchRoot = lastAccept;
+ }
+ }
+
+ /**
+ * The pattern is converted to normalizedD form and then a pure group
+ * is constructed to match canonical equivalences of the characters.
+ */
+ private void normalize() {
+ boolean inCharClass = false;
+ int lastCodePoint = -1;
+
+ // Convert pattern into normalizedD form
+ normalizedPattern = Normalizer.normalize(pattern, Normalizer.Form.NFD);
+ patternLength = normalizedPattern.length();
+
+ // Modify pattern to match canonical equivalences
+ StringBuilder newPattern = new StringBuilder(patternLength);
+ for(int i=0; i<patternLength; ) {
+ int c = normalizedPattern.codePointAt(i);
+ StringBuilder sequenceBuffer;
+ if ((Character.getType(c) == Character.NON_SPACING_MARK)
+ && (lastCodePoint != -1)) {
+ sequenceBuffer = new StringBuilder();
+ sequenceBuffer.appendCodePoint(lastCodePoint);
+ sequenceBuffer.appendCodePoint(c);
+ while(Character.getType(c) == Character.NON_SPACING_MARK) {
+ i += Character.charCount(c);
+ if (i >= patternLength)
+ break;
+ c = normalizedPattern.codePointAt(i);
+ sequenceBuffer.appendCodePoint(c);
+ }
+ String ea = produceEquivalentAlternation(
+ sequenceBuffer.toString());
+ newPattern.setLength(newPattern.length()-Character.charCount(lastCodePoint));
+ newPattern.append("(?:").append(ea).append(")");
+ } else if (c == '[' && lastCodePoint != '\\') {
+ i = normalizeCharClass(newPattern, i);
+ } else {
+ newPattern.appendCodePoint(c);
+ }
+ lastCodePoint = c;
+ i += Character.charCount(c);
+ }
+ normalizedPattern = newPattern.toString();
+ }
+
+ /**
+ * Complete the character class being parsed and add a set
+ * of alternations to it that will match the canonical equivalences
+ * of the characters within the class.
+ */
+ private int normalizeCharClass(StringBuilder newPattern, int i) {
+ StringBuilder charClass = new StringBuilder();
+ StringBuilder eq = null;
+ int lastCodePoint = -1;
+ String result;
+
+ i++;
+ if (i == normalizedPattern.length())
+ throw error("Unclosed character class");
+ charClass.append("[");
+ while(true) {
+ int c = normalizedPattern.codePointAt(i);
+ StringBuilder sequenceBuffer;
+
+ if (c == ']' && lastCodePoint != '\\') {
+ charClass.append((char)c);
+ break;
+ } else if (Character.getType(c) == Character.NON_SPACING_MARK) {
+ sequenceBuffer = new StringBuilder();
+ sequenceBuffer.appendCodePoint(lastCodePoint);
+ while(Character.getType(c) == Character.NON_SPACING_MARK) {
+ sequenceBuffer.appendCodePoint(c);
+ i += Character.charCount(c);
+ if (i >= normalizedPattern.length())
+ break;
+ c = normalizedPattern.codePointAt(i);
+ }
+ String ea = produceEquivalentAlternation(
+ sequenceBuffer.toString());
+
+ charClass.setLength(charClass.length()-Character.charCount(lastCodePoint));
+ if (eq == null)
+ eq = new StringBuilder();
+ eq.append('|');
+ eq.append(ea);
+ } else {
+ charClass.appendCodePoint(c);
+ i++;
+ }
+ if (i == normalizedPattern.length())
+ throw error("Unclosed character class");
+ lastCodePoint = c;
+ }
+
+ if (eq != null) {
+ result = "(?:"+charClass.toString()+eq.toString()+")";
+ } else {
+ result = charClass.toString();
+ }
+
+ newPattern.append(result);
+ return i;
+ }
+
+ /**
+ * Given a specific sequence composed of a regular character and
+ * combining marks that follow it, produce the alternation that will
+ * match all canonical equivalences of that sequence.
+ */
+ private String produceEquivalentAlternation(String source) {
+ int len = countChars(source, 0, 1);
+ if (source.length() == len)
+ // source has one character.
+ return source;
+
+ String base = source.substring(0,len);
+ String combiningMarks = source.substring(len);
+
+ String[] perms = producePermutations(combiningMarks);
+ StringBuilder result = new StringBuilder(source);
+
+ // Add combined permutations
+ for(int x=0; x<perms.length; x++) {
+ String next = base + perms[x];
+ if (x>0)
+ result.append("|"+next);
+ next = composeOneStep(next);
+ if (next != null)
+ result.append("|"+produceEquivalentAlternation(next));
+ }
+ return result.toString();
+ }
+
+ /**
+ * Returns an array of strings that have all the possible
+ * permutations of the characters in the input string.
+ * This is used to get a list of all possible orderings
+ * of a set of combining marks. Note that some of the permutations
+ * are invalid because of combining class collisions, and these
+ * possibilities must be removed because they are not canonically
+ * equivalent.
+ */
+ private String[] producePermutations(String input) {
+ if (input.length() == countChars(input, 0, 1))
+ return new String[] {input};
+
+ if (input.length() == countChars(input, 0, 2)) {
+ int c0 = Character.codePointAt(input, 0);
+ int c1 = Character.codePointAt(input, Character.charCount(c0));
+ if (getClass(c1) == getClass(c0)) {
+ return new String[] {input};
+ }
+ String[] result = new String[2];
+ result[0] = input;
+ StringBuilder sb = new StringBuilder(2);
+ sb.appendCodePoint(c1);
+ sb.appendCodePoint(c0);
+ result[1] = sb.toString();
+ return result;
+ }
+
+ int length = 1;
+ int nCodePoints = countCodePoints(input);
+ for(int x=1; x<nCodePoints; x++)
+ length = length * (x+1);
+
+ String[] temp = new String[length];
+
+ int combClass[] = new int[nCodePoints];
+ for(int x=0, i=0; x<nCodePoints; x++) {
+ int c = Character.codePointAt(input, i);
+ combClass[x] = getClass(c);
+ i += Character.charCount(c);
+ }
+
+ // For each char, take it out and add the permutations
+ // of the remaining chars
+ int index = 0;
+ int len;
+ // offset maintains the index in code units.
+loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
+ len = countChars(input, offset, 1);
+ boolean skip = false;
+ for(int y=x-1; y>=0; y--) {
+ if (combClass[y] == combClass[x]) {
+ continue loop;
+ }
+ }
+ StringBuilder sb = new StringBuilder(input);
+ String otherChars = sb.delete(offset, offset+len).toString();
+ String[] subResult = producePermutations(otherChars);
+
+ String prefix = input.substring(offset, offset+len);
+ for(int y=0; y<subResult.length; y++)
+ temp[index++] = prefix + subResult[y];
+ }
+ String[] result = new String[index];
+ for (int x=0; x<index; x++)
+ result[x] = temp[x];
+ return result;
+ }
+
+ private int getClass(int c) {
+ return sun.text.Normalizer.getCombiningClass(c);
+ }
+
+ /**
+ * Attempts to compose input by combining the first character
+ * with the first combining mark following it. Returns a String
+ * that is the composition of the leading character with its first
+ * combining mark followed by the remaining combining marks. Returns
+ * null if the first two characters cannot be further composed.
+ */
+ private String composeOneStep(String input) {
+ int len = countChars(input, 0, 2);
+ String firstTwoCharacters = input.substring(0, len);
+ String result = Normalizer.normalize(firstTwoCharacters, Normalizer.Form.NFC);
+
+ if (result.equals(firstTwoCharacters))
+ return null;
+ else {
+ String remainder = input.substring(len);
+ return result + remainder;
+ }
+ }
+
+ /**
+ * Preprocess any \Q...\E sequences in `temp', meta-quoting them.
+ * See the description of `quotemeta' in perlfunc(1).
+ */
+ private void RemoveQEQuoting() {
+ final int pLen = patternLength;
+ int i = 0;
+ while (i < pLen-1) {
+ if (temp[i] != '\\')
+ i += 1;
+ else if (temp[i + 1] != 'Q')
+ i += 2;
+ else
+ break;
+ }
+ if (i >= pLen - 1) // No \Q sequence found
+ return;
+ int j = i;
+ i += 2;
+ int[] newtemp = new int[j + 3*(pLen-i) + 2];
+ System.arraycopy(temp, 0, newtemp, 0, j);
+
+ boolean inQuote = true;
+ boolean beginQuote = true;
+ while (i < pLen) {
+ int c = temp[i++];
+ if (!ASCII.isAscii(c) || ASCII.isAlpha(c)) {
+ newtemp[j++] = c;
+ } else if (ASCII.isDigit(c)) {
+ if (beginQuote) {
+ /*
+ * A unicode escape \[0xu] could be before this quote,
+ * and we don't want this numeric char to processed as
+ * part of the escape.
+ */
+ newtemp[j++] = '\\';
+ newtemp[j++] = 'x';
+ newtemp[j++] = '3';
+ }
+ newtemp[j++] = c;
+ } else if (c != '\\') {
+ if (inQuote) newtemp[j++] = '\\';
+ newtemp[j++] = c;
+ } else if (inQuote) {
+ if (temp[i] == 'E') {
+ i++;
+ inQuote = false;
+ } else {
+ newtemp[j++] = '\\';
+ newtemp[j++] = '\\';
+ }
+ } else {
+ if (temp[i] == 'Q') {
+ i++;
+ inQuote = true;
+ beginQuote = true;
+ continue;
+ } else {
+ newtemp[j++] = c;
+ if (i != pLen)
+ newtemp[j++] = temp[i++];
+ }
+ }
+
+ beginQuote = false;
+ }
+
+ patternLength = j;
+ temp = Arrays.copyOf(newtemp, j + 2); // double zero termination
+ }
+
+ /**
+ * Copies regular expression to an int array and invokes the parsing
+ * of the expression which will create the object tree.
+ */
+ private void compile() {
+ // Handle canonical equivalences
+ if (has(CANON_EQ) && !has(LITERAL)) {
+ normalize();
+ } else {
+ normalizedPattern = pattern;
+ }
+ patternLength = normalizedPattern.length();
+
+ // Copy pattern to int array for convenience
+ // Use double zero to terminate pattern
+ temp = new int[patternLength + 2];
+
+ hasSupplementary = false;
+ int c, count = 0;
+ // Convert all chars into code points
+ for (int x = 0; x < patternLength; x += Character.charCount(c)) {
+ c = normalizedPattern.codePointAt(x);
+ if (isSupplementary(c)) {
+ hasSupplementary = true;
+ }
+ temp[count++] = c;
+ }
+
+ patternLength = count; // patternLength now in code points
+
+ if (! has(LITERAL))
+ RemoveQEQuoting();
+
+ // Allocate all temporary objects here.
+ buffer = new int[32];
+ groupNodes = new GroupHead[10];
+ namedGroups = null;
+
+ if (has(LITERAL)) {
+ // Literal pattern handling
+ matchRoot = newSlice(temp, patternLength, hasSupplementary);
+ matchRoot.next = lastAccept;
+ } else {
+ // Start recursive descent parsing
+ matchRoot = expr(lastAccept);
+ // Check extra pattern characters
+ if (patternLength != cursor) {
+ if (peek() == ')') {
+ throw error("Unmatched closing ')'");
+ } else {
+ throw error("Unexpected internal error");
+ }
+ }
+ }
+
+ // Peephole optimization
+ if (matchRoot instanceof Slice) {
+ root = BnM.optimize(matchRoot);
+ if (root == matchRoot) {
+ root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
+ }
+ } else if (matchRoot instanceof Begin || matchRoot instanceof First) {
+ root = matchRoot;
+ } else {
+ root = hasSupplementary ? new StartS(matchRoot) : new Start(matchRoot);
+ }
+
+ // Release temporary storage
+ temp = null;
+ buffer = null;
+ groupNodes = null;
+ patternLength = 0;
+ compiled = true;
+ }
+
+ Map<String, Integer> namedGroups() {
+ if (namedGroups == null)
+ namedGroups = new HashMap<>(2);
+ return namedGroups;
+ }
+
+ /**
+ * Used to print out a subtree of the Pattern to help with debugging.
+ */
+ private static void printObjectTree(Node node) {
+ while(node != null) {
+ if (node instanceof Prolog) {
+ System.out.println(node);
+ printObjectTree(((Prolog)node).loop);
+ System.out.println("**** end contents prolog loop");
+ } else if (node instanceof Loop) {
+ System.out.println(node);
+ printObjectTree(((Loop)node).body);
+ System.out.println("**** end contents Loop body");
+ } else if (node instanceof Curly) {
+ System.out.println(node);
+ printObjectTree(((Curly)node).atom);
+ System.out.println("**** end contents Curly body");
+ } else if (node instanceof GroupCurly) {
+ System.out.println(node);
+ printObjectTree(((GroupCurly)node).atom);
+ System.out.println("**** end contents GroupCurly body");
+ } else if (node instanceof GroupTail) {
+ System.out.println(node);
+ System.out.println("Tail next is "+node.next);
+ return;
+ } else {
+ System.out.println(node);
+ }
+ node = node.next;
+ if (node != null)
+ System.out.println("->next:");
+ if (node == Pattern.accept) {
+ System.out.println("Accept Node");
+ node = null;
+ }
+ }
+ }
+
+ /**
+ * Used to accumulate information about a subtree of the object graph
+ * so that optimizations can be applied to the subtree.
+ */
+ static final class TreeInfo {
+ int minLength;
+ int maxLength;
+ boolean maxValid;
+ boolean deterministic;
+
+ TreeInfo() {
+ reset();
+ }
+ void reset() {
+ minLength = 0;
+ maxLength = 0;
+ maxValid = true;
+ deterministic = true;
+ }
+ }
+
+ /*
+ * The following private methods are mainly used to improve the
+ * readability of the code. In order to let the Java compiler easily
+ * inline them, we should not put many assertions or error checks in them.
+ */
+
+ /**
+ * Indicates whether a particular flag is set or not.
+ */
+ private boolean has(int f) {
+ return (flags & f) != 0;
+ }
+
+ /**
+ * Match next character, signal error if failed.
+ */
+ private void accept(int ch, String s) {
+ int testChar = temp[cursor++];
+ if (has(IGNORE_WHITESPACE))
+ testChar = parsePastWhitespace(testChar);
+ if (ch != testChar) {
+ throw error(s);
+ }
+ }
+
+ /**
+ * Mark the end of pattern with a specific character.
+ */
+ private void mark(int c) {
+ temp[patternLength] = c;
+ }
+
+ /**
+ * Peek the next character, and do not advance the cursor.
+ */
+ private int peek() {
+ int ch = temp[cursor];
+ if (has(IGNORE_WHITESPACE))
+ ch = peekPastWhitespace(ch);
+ return ch;
+ }
+
+ /**
+ * Read the next character, and advance the cursor by one.
+ */
+ private int read() {
+ int ch = temp[cursor++];
+ if (has(IGNORE_WHITESPACE))
+ ch = parsePastWhitespace(ch);
+ return ch;
+ }
+
+ /**
+ * Read the next character, and advance the cursor by one,
+ * ignoring the COMMENTS setting
+ */
+ private int readEscaped() {
+ int ch = temp[cursor++];
+ return ch;
+ }
+
+ /**
+ * Advance the cursor by one, and peek the next character.
+ */
+ private int next() {
+ int ch = temp[++cursor];
+ if (has(IGNORE_WHITESPACE))
+ ch = peekPastWhitespace(ch);
+ return ch;
+ }
+
+ /**
+ * Advance the cursor by one, and peek the next character,
+ * ignoring the COMMENTS setting
+ */
+ private int nextEscaped() {
+ int ch = temp[++cursor];
+ return ch;
+ }
+
+ /**
+ * If in xmode peek past whitespace.
+ */
+ private int peekPastWhitespace(int ch) {
+ while (ASCII.isSpace(ch))
+ ch = temp[++cursor];
+
+ return ch;
+ }
+
+ /**
+ * If in xmode parse past whitespace.
+ */
+ private int parsePastWhitespace(int ch) {
+ while (ASCII.isSpace(ch))
+ ch = temp[cursor++];
+
+ return ch;
+ }
+
+ /**
+ * xmode parse past comment to end of line.
+ */
+ private int parsePastLine() {
+ int ch = temp[cursor++];
+ while (ch != 0 && !isLineSeparator(ch))
+ ch = temp[cursor++];
+ if (ch == 0 && cursor > patternLength) {
+ cursor = patternLength;
+ ch = temp[cursor++];
+ }
+ return ch;
+ }
+
+ /**
+ * xmode peek past comment to end of line.
+ */
+ private int peekPastLine() {
+ int ch = temp[++cursor];
+ while (ch != 0 && !isLineSeparator(ch))
+ ch = temp[++cursor];
+ if (ch == 0 && cursor > patternLength) {
+ cursor = patternLength;
+ ch = temp[cursor];
+ }
+ return ch;
+ }
+
+ /**
+ * Determines if character is a line separator in the current mode
+ */
+ private boolean isLineSeparator(int ch) {
+ if (has(UNIX_LINES)) {
+ return ch == '\n';
+ } else {
+ return (ch == '\n' ||
+ ch == '\r' ||
+ (ch|1) == '\u2029' ||
+ ch == '\u0085');
+ }
+ }
+
+ /**
+ * Read the character after the next one, and advance the cursor by two.
+ */
+ private int skip() {
+ int i = cursor;
+ int ch = temp[i+1];
+ cursor = i + 2;
+ return ch;
+ }
+
+ /**
+ * Unread one next character, and retreat cursor by one.
+ */
+ private void unread() {
+ cursor--;
+ }
+
+ /**
+ * Internal method used for handling all syntax errors. The pattern is
+ * displayed with a pointer to aid in locating the syntax error.
+ */
+ private PatternSyntaxException error(String s) {
+ return new PatternSyntaxException(s, normalizedPattern, cursor - 1);
+ }
+
+ /**
+ * Determines if there is any supplementary character or unpaired
+ * surrogate in the specified range.
+ */
+ private boolean findSupplementary(int start, int end) {
+ for (int i = start; i < end; i++) {
+ if (isSupplementary(temp[i]))
+ return true;
+ }
+ return false;
+ }
+
+ /**
+ * Determines if the specified code point is a supplementary
+ * character or unpaired surrogate.
+ */
+ private static final boolean isSupplementary(int ch) {
+ return ch >= Character.MIN_SUPPLEMENTARY_CODE_POINT ||
+ Character.isSurrogate((char)ch);
+ }
+
+ /**
+ * The following methods handle the main parsing. They are sorted
+ * according to their precedence order, the lowest one first.
+ */
+
+ /**
+ * The expression is parsed with branch nodes added for alternations.
+ * This may be called recursively to parse sub expressions that may
+ * contain alternations.
+ */
+ private Node expr(Node end) {
+ Node prev = null;
+ Node firstTail = null;
+ Branch branch = null;
+ Node branchConn = null;
+
+ for (;;) {
+ Node node = sequence(end);
+ Node nodeTail = root; //double return
+ if (prev == null) {
+ prev = node;
+ firstTail = nodeTail;
+ } else {
+ // Branch
+ if (branchConn == null) {
+ branchConn = new BranchConn();
+ branchConn.next = end;
+ }
+ if (node == end) {
+ // if the node returned from sequence() is "end"
+ // we have an empty expr, set a null atom into
+ // the branch to indicate to go "next" directly.
+ node = null;
+ } else {
+ // the "tail.next" of each atom goes to branchConn
+ nodeTail.next = branchConn;
+ }
+ if (prev == branch) {
+ branch.add(node);
+ } else {
+ if (prev == end) {
+ prev = null;
+ } else {
+ // replace the "end" with "branchConn" at its tail.next
+ // when put the "prev" into the branch as the first atom.
+ firstTail.next = branchConn;
+ }
+ prev = branch = new Branch(prev, node, branchConn);
+ }
+ }
+ if (peek() != '|') {
+ return prev;
+ }
+ next();
+ }
+ }
+
+ @SuppressWarnings("fallthrough")
+ /**
+ * Parsing of sequences between alternations.
+ */
+ private Node sequence(Node end) {
+ Node head = null;
+ Node tail = null;
+ Node node = null;
+ LOOP:
+ for (;;) {
+ int ch = peek();
+ switch (ch) {
+ case '(':
+ // Because group handles its own closure,
+ // we need to treat it differently
+ node = group0();
+ // Check for comment or flag group
+ if (node == null)
+ continue;
+ if (head == null)
+ head = node;
+ else
+ tail.next = node;
+ // Double return: Tail was returned in root
+ tail = root;
+ continue;
+ case '[':
+ node = clazz(true);
+ break;
+ case '\\':
+ ch = nextEscaped();
+ if (ch == 'p' || ch == 'P') {
+ boolean oneLetter = true;
+ boolean comp = (ch == 'P');
+ ch = next(); // Consume { if present
+ if (ch != '{') {
+ unread();
+ } else {
+ oneLetter = false;
+ }
+ node = family(oneLetter, comp);
+ } else {
+ unread();
+ node = atom();
+ }
+ break;
+ case '^':
+ next();
+ if (has(MULTILINE)) {
+ if (has(UNIX_LINES))
+ node = new UnixCaret();
+ else
+ node = new Caret();
+ } else {
+ node = new Begin();
+ }
+ break;
+ case '$':
+ next();
+ if (has(UNIX_LINES))
+ node = new UnixDollar(has(MULTILINE));
+ else
+ node = new Dollar(has(MULTILINE));
+ break;
+ case '.':
+ next();
+ if (has(DOTALL)) {
+ node = new All();
+ } else {
+ /*if (has(UNIX_LINES))
+ node = new UnixDot();
+ else {
+ node = new Dot();
+ }*/
+ // changes for XPath 3.1 regex compliance
+ node = new XPath2Dot();
+ }
+ break;
+ case '|':
+ case ')':
+ break LOOP;
+ case ']': // Now interpreting dangling ] and } as literals
+ case '}':
+ node = atom();
+ break;
+ case '?':
+ case '*':
+ case '+':
+ next();
+ throw error("Dangling meta character '" + ((char)ch) + "'");
+ case 0:
+ if (cursor >= patternLength) {
+ break LOOP;
+ }
+ // Fall through
+ default:
+ node = atom();
+ break;
+ }
+
+ node = closure(node);
+
+ if (head == null) {
+ head = tail = node;
+ } else {
+ tail.next = node;
+ tail = node;
+ }
+ }
+ if (head == null) {
+ return end;
+ }
+ tail.next = end;
+ root = tail; //double return
+ return head;
+ }
+
+ @SuppressWarnings("fallthrough")
+ /**
+ * Parse and add a new Single or Slice.
+ */
+ private Node atom() {
+ int first = 0;
+ int prev = -1;
+ boolean hasSupplementary = false;
+ int ch = peek();
+ for (;;) {
+ switch (ch) {
+ case '*':
+ case '+':
+ case '?':
+ case '{':
+ if (first > 1) {
+ cursor = prev; // Unwind one character
+ first--;
+ }
+ break;
+ case '$':
+ case '.':
+ case '^':
+ case '(':
+ case '[':
+ case '|':
+ case ')':
+ break;
+ case '\\':
+ ch = nextEscaped();
+ if (ch == 'p' || ch == 'P') { // Property
+ if (first > 0) { // Slice is waiting; handle it first
+ unread();
+ break;
+ } else { // No slice; just return the family node
+ boolean comp = (ch == 'P');
+ boolean oneLetter = true;
+ ch = next(); // Consume { if present
+ if (ch != '{')
+ unread();
+ else
+ oneLetter = false;
+ return family(oneLetter, comp);
+ }
+ }
+ unread();
+ prev = cursor;
+ ch = escape(false, first == 0, false);
+ if (ch >= 0) {
+ append(ch, first);
+ first++;
+ if (isSupplementary(ch)) {
+ hasSupplementary = true;
+ }
+ ch = peek();
+ continue;
+ } else if (first == 0) {
+ return root;
+ }
+ // Unwind meta escape sequence
+ cursor = prev;
+ break;
+ case 0:
+ if (cursor >= patternLength) {
+ break;
+ }
+ // Fall through
+ default:
+ prev = cursor;
+ append(ch, first);
+ first++;
+ if (isSupplementary(ch)) {
+ hasSupplementary = true;
+ }
+ ch = next();
+ continue;
+ }
+ break;
+ }
+ if (first == 1) {
+ return newSingle(buffer[0]);
+ } else {
+ return newSlice(buffer, first, hasSupplementary);
+ }
+ }
+
+ private void append(int ch, int len) {
+ if (len >= buffer.length) {
+ int[] tmp = new int[len+len];
+ System.arraycopy(buffer, 0, tmp, 0, len);
+ buffer = tmp;
+ }
+ buffer[len] = ch;
+ }
+
+ /**
+ * Parses a backref greedily, taking as many numbers as it
+ * can. The first digit is always treated as a backref, but
+ * multi digit numbers are only treated as a backref if at
+ * least that many backrefs exist at this point in the regex.
+ */
+ private Node ref(int refNum) {
+ boolean done = false;
+ while(!done) {
+ int ch = peek();
+ switch(ch) {
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ int newRefNum = (refNum * 10) + (ch - '0');
+ // Add another number if it doesn't make a group
+ // that doesn't exist
+ if (capturingGroupCount - 1 < newRefNum) {
+ done = true;
+ break;
+ }
+ refNum = newRefNum;
+ read();
+ break;
+ default:
+ done = true;
+ break;
+ }
+ }
+ if (has(CASE_INSENSITIVE))
+ return new CIBackRef(refNum, has(UNICODE_CASE));
+ else
+ return new BackRef(refNum);
+ }
+
+ /**
+ * Parses an escape sequence to determine the actual value that needs
+ * to be matched.
+ * If -1 is returned and create was true a new object was added to the tree
+ * to handle the escape sequence.
+ * If the returned value is greater than zero, it is the value that
+ * matches the escape sequence.
+ */
+ private int escape(boolean inclass, boolean create, boolean isrange) {
+ int ch = skip();
+ switch (ch) {
+ case '0':
+ // changes for XPath 3.1 regex compliance
+ break;
+ // return o();
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ if (inclass) break;
+ if (create) {
+ root = ref((ch - '0'));
+ }
+ return -1;
+ case 'A':
+ if (inclass) break;
+ if (create) root = new Begin();
+ return -1;
+ case 'B':
+ if (inclass) break;
+ if (create) root = new Bound(Bound.NONE, has(UNICODE_CHARACTER_CLASS));
+ return -1;
+ case 'C':
+ // break;
+ // changes for XPath 3.1 regex compliance
+ if (create) root = new XMLNameChar().complement();
+ return -1;
+ case 'D':
+ if (create) root = has(UNICODE_CHARACTER_CLASS)
+ ? new Utype(UnicodeProp.DIGIT).complement()
+ : new Ctype(ASCII.DIGIT).complement();
+ return -1;
+ case 'E':
+ case 'F':
+ break;
+ case 'G':
+ if (inclass) break;
+ if (create) root = new LastMatch();
+ return -1;
+ case 'H':
+ if (create) root = new HorizWS().complement();
+ return -1;
+ case 'I':
+ // changes for XPath 3.1 regex compliance
+ if (create) root = new XMLNameStartChar().complement();
+ return -1;
+ case 'J':
+ case 'K':
+ case 'L':
+ case 'M':
+ case 'N':
+ case 'O':
+ case 'P':
+ case 'Q':
+ break;
+ case 'R':
+ if (inclass) break;
+ if (create) root = new LineEnding();
+ return -1;
+ case 'S':
+ /*if (create) root = has(UNICODE_CHARACTER_CLASS)
+ ? new Utype(UnicodeProp.WHITE_SPACE).complement()
+ : new Ctype(ASCII.SPACE).complement();*/
+ // changes for XPath 3.1 regex compliance
+ if (create) root = new XPath2Whitespace().complement();
+ return -1;
+ case 'T':
+ case 'U':
+ break;
+ case 'V':
+ if (create) root = new VertWS().complement();
+ return -1;
+ case 'W':
+ if (create) root = has(UNICODE_CHARACTER_CLASS)
+ ? new Utype(UnicodeProp.WORD).complement()
+ : new Ctype(ASCII.WORD).complement();
+ return -1;
+ case 'X':
+ case 'Y':
+ break;
+ case 'Z':
+ if (inclass) break;
+ if (create) {
+ if (has(UNIX_LINES))
+ root = new UnixDollar(false);
+ else
+ root = new Dollar(false);
+ }
+ return -1;
+ case 'a':
+ // changes for XPath 3.1 regex compliance
+ break;
+ // return '\007';
+ case 'b':
+ if (inclass) break;
+ if (create) root = new Bound(Bound.BOTH, has(UNICODE_CHARACTER_CLASS));
+ return -1;
+ case 'c':
+ //return c();
+ // changes for XPath 3.1 regex compliance
+ if (create) root = new XMLNameChar();
+ return -1;
+ case 'd':
+ if (create) root = has(UNICODE_CHARACTER_CLASS)
+ ? new Utype(UnicodeProp.DIGIT)
+ : new Ctype(ASCII.DIGIT);
+ return -1;
+ case 'e':
+ // changes for XPath 3.1 regex compliance
+ break;
+ // return '\033';
+ case 'f':
+ // changes for XPath 3.1 regex compliance
+ break;
+ // return '\f';
+ case 'g':
+ break;
+ case 'h':
+ if (create) root = new HorizWS();
+ return -1;
+ case 'i':
+ // changes for XPath 3.1 regex compliance
+ if (create) root = new XMLNameStartChar();
+ return -1;
+ case 'j':
+ break;
+ case 'k':
+ if (inclass)
+ break;
+ if (read() != '<')
+ throw error("\\k is not followed by '<' for named capturing group");
+ String name = groupname(read());
+ if (!namedGroups().containsKey(name))
+ throw error("(named capturing group <"+ name+"> does not exit");
+ if (create) {
+ if (has(CASE_INSENSITIVE))
+ root = new CIBackRef(namedGroups().get(name), has(UNICODE_CASE));
+ else
+ root = new BackRef(namedGroups().get(name));
+ }
+ return -1;
+ case 'l':
+ case 'm':
+ break;
+ case 'n':
+ return '\n';
+ case 'o':
+ case 'p':
+ case 'q':
+ break;
+ case 'r':
+ return '\r';
+ case 's':
+ /*if (create) root = has(UNICODE_CHARACTER_CLASS)
+ ? new Utype(UnicodeProp.WHITE_SPACE)
+ : new Ctype(ASCII.SPACE);*/
+ // changes for XPath 3.1 regex compliance
+ if (create) root = new XPath2Whitespace();
+ return -1;
+ case 't':
+ return '\t';
+ case 'u':
+ // changes for XPath 3.1 regex compliance
+ break;
+ // return u();
+ case 'v':
+ // changes for XPath 3.1 regex compliance
+ break;
+ // '\v' was implemented as VT/0x0B in releases < 1.8 (though
+ // undocumented). In JDK8 '\v' is specified as a predefined
+ // character class for all vertical whitespace characters.
+ // So [-1, root=VertWS node] pair is returned (instead of a
+ // single 0x0B). This breaks the range if '\v' is used as
+ // the start or end value, such as [\v-...] or [...-\v], in
+ // which a single definite value (0x0B) is expected. For
+ // compatibility concern '\013'/0x0B is returned if isrange.
+ /*if (isrange)
+ return '\013';
+ if (create) root = new VertWS();
+ return -1;*/
+ case 'w':
+ if (create) root = has(UNICODE_CHARACTER_CLASS)
+ ? new Utype(UnicodeProp.WORD)
+ : new Ctype(ASCII.WORD);
+ return -1;
+ case 'x':
+ // changes for XPath 3.1 regex compliance
+ break;
+ // return x();
+ case 'y':
+ break;
+ case 'z':
+ if (inclass) break;
+ if (create) root = new End();
+ return -1;
+ default:
+ return ch;
+ }
+ throw error("Illegal/unsupported escape sequence");
+ }
+
+ /**
+ * Parse a character class, and return the node that matches it.
+ *
+ * Consumes a ] on the way out if consume is true. Usually consume
+ * is true except for the case of [abc&&def] where def is a separate
+ * right hand node with "understood" brackets.
+ */
+ private CharProperty clazz(boolean consume) {
+ CharProperty prev = null;
+ CharProperty node = null;
+ BitClass bits = new BitClass();
+ boolean include = true;
+ boolean firstInClass = true;
+ int ch = next();
+ for (;;) {
+ switch (ch) {
+ case '^':
+ // Negates if first char in a class, otherwise literal
+ if (firstInClass) {
+ if (temp[cursor-1] != '[')
+ break;
+ ch = next();
+ include = !include;
+ continue;
+ } else {
+ // ^ not first in class, treat as literal
+ break;
+ }
+ case '[':
+ firstInClass = false;
+ node = clazz(true);
+ if (prev == null)
+ prev = node;
+ else
+ prev = union(prev, node);
+ ch = peek();
+ continue;
+ case '&':
+ firstInClass = false;
+ ch = next();
+ if (ch == '&') {
+ ch = next();
+ CharProperty rightNode = null;
+ while (ch != ']' && ch != '&') {
+ if (ch == '[') {
+ if (rightNode == null)
+ rightNode = clazz(true);
+ else
+ rightNode = union(rightNode, clazz(true));
+ } else { // abc&&def
+ unread();
+ rightNode = clazz(false);
+ }
+ ch = peek();
+ }
+ if (rightNode != null)
+ node = rightNode;
+ if (prev == null) {
+ if (rightNode == null)
+ throw error("Bad class syntax");
+ else
+ prev = rightNode;
+ } else {
+ prev = intersection(prev, node);
+ }
+ } else {
+ // treat as a literal &
+ unread();
+ break;
+ }
+ continue;
+ case 0:
+ firstInClass = false;
+ if (cursor >= patternLength)
+ throw error("Unclosed character class");
+ break;
+ case ']':
+ firstInClass = false;
+ if (prev != null) {
+ if (consume)
+ next();
+ return prev;
+ }
+ break;
+ default:
+ firstInClass = false;
+ break;
+ }
+ node = range(bits);
+ if (include) {
+ if (prev == null) {
+ prev = node;
+ } else {
+ if (prev != node)
+ prev = union(prev, node);
+ }
+ } else {
+ if (prev == null) {
+ prev = node.complement();
+ } else {
+ if (prev != node)
+ prev = setDifference(prev, node);
+ }
+ }
+ ch = peek();
+ }
+ }
+
+ private CharProperty bitsOrSingle(BitClass bits, int ch) {
+ /* Bits can only handle codepoints in [u+0000-u+00ff] range.
+ Use "single" node instead of bits when dealing with unicode
+ case folding for codepoints listed below.
+ (1)Uppercase out of range: u+00ff, u+00b5
+ toUpperCase(u+00ff) -> u+0178
+ toUpperCase(u+00b5) -> u+039c
+ (2)LatinSmallLetterLongS u+17f
+ toUpperCase(u+017f) -> u+0053
+ (3)LatinSmallLetterDotlessI u+131
+ toUpperCase(u+0131) -> u+0049
+ (4)LatinCapitalLetterIWithDotAbove u+0130
+ toLowerCase(u+0130) -> u+0069
+ (5)KelvinSign u+212a
+ toLowerCase(u+212a) ==> u+006B
+ (6)AngstromSign u+212b
+ toLowerCase(u+212b) ==> u+00e5
+ */
+ int d;
+ if (ch < 256 &&
+ !(has(CASE_INSENSITIVE) && has(UNICODE_CASE) &&
+ (ch == 0xff || ch == 0xb5 ||
+ ch == 0x49 || ch == 0x69 || //I and i
+ ch == 0x53 || ch == 0x73 || //S and s
+ ch == 0x4b || ch == 0x6b || //K and k
+ ch == 0xc5 || ch == 0xe5))) //A+ring
+ return bits.add(ch, flags());
+ return newSingle(ch);
+ }
+
+ /**
+ * Parse a single character or a character range in a character class
+ * and return its representative node.
+ */
+ private CharProperty range(BitClass bits) {
+ int ch = peek();
+ if (ch == '\\') {
+ ch = nextEscaped();
+ if (ch == 'p' || ch == 'P') { // A property
+ boolean comp = (ch == 'P');
+ boolean oneLetter = true;
+ // Consume { if present
+ ch = next();
+ if (ch != '{')
+ unread();
+ else
+ oneLetter = false;
+ return family(oneLetter, comp);
+ } else { // ordinary escape
+ boolean isrange = temp[cursor+1] == '-';
+ unread();
+ ch = escape(true, true, isrange);
+ if (ch == -1)
+ return (CharProperty) root;
+ }
+ } else {
+ next();
+ }
+ if (ch >= 0) {
+ if (peek() == '-') {
+ int endRange = temp[cursor+1];
+ if (endRange == '[') {
+ return bitsOrSingle(bits, ch);
+ }
+ if (endRange != ']') {
+ next();
+ int m = peek();
+ if (m == '\\') {
+ m = escape(true, false, true);
+ } else {
+ next();
+ }
+ if (m < ch) {
+ throw error("Illegal character range");
+ }
+ if (has(CASE_INSENSITIVE))
+ return caseInsensitiveRangeFor(ch, m);
+ else
+ return rangeFor(ch, m);
+ }
+ }
+ return bitsOrSingle(bits, ch);
+ }
+ throw error("Unexpected character '"+((char)ch)+"'");
+ }
+
+ /**
+ * Parses a Unicode character family and returns its representative node.
+ */
+ private CharProperty family(boolean singleLetter,
+ boolean maybeComplement)
+ {
+ next();
+ String name;
+ CharProperty node = null;
+
+ if (singleLetter) {
+ int c = temp[cursor];
+ if (!Character.isSupplementaryCodePoint(c)) {
+ name = String.valueOf((char)c);
+ } else {
+ name = new String(temp, cursor, 1);
+ }
+ read();
+ } else {
+ int i = cursor;
+ mark('}');
+ while(read() != '}') {
+ }
+ mark('\000');
+ int j = cursor;
+ if (j > patternLength)
+ throw error("Unclosed character family");
+ if (i + 1 >= j)
+ throw error("Empty character family");
+ name = new String(temp, i, j-i-1);
+ }
+
+ /*int i = name.indexOf('=');
+ if (i != -1) {
+ // property construct \p{name=value}
+ String value = name.substring(i + 1);
+ name = name.substring(0, i).toLowerCase(Locale.ENGLISH);
+ if ("sc".equals(name) || "script".equals(name)) {
+ node = unicodeScriptPropertyFor(value);
+ } else if ("blk".equals(name) || "block".equals(name)) {
+ node = unicodeBlockPropertyFor(value);
+ } else if ("gc".equals(name) || "general_category".equals(name)) {
+ node = charPropertyNodeFor(value);
+ } else {
+ throw error("Unknown Unicode property {name=<" + name + ">, "
+ + "value=<" + value + ">}");
+ }*/
+ //} else {
+ /*if (name.startsWith("In")) {
+ // \p{inBlockName}
+ node = unicodeBlockPropertyFor(name.substring(2));
+ } else if (name.startsWith("Is")) {
+ // \p{isGeneralCategory} and \p{isScriptName}
+ name = name.substring(2);
+ UnicodeProp uprop = UnicodeProp.forName(name);
+ if (uprop != null)
+ node = new Utype(uprop);
+ if (node == null)
+ node = CharPropertyNames.charPropertyFor(name);
+ if (node == null)
+ node = unicodeScriptPropertyFor(name);
+ } else {
+ if (has(UNICODE_CHARACTER_CLASS)) {
+ UnicodeProp uprop = UnicodeProp.forPOSIXName(name);
+ if (uprop != null)
+ node = new Utype(uprop);
+ }
+ if (node == null)
+ node = charPropertyNodeFor(name);
+ }*/
+
+ // changes for XPath 3.1 regex compliance
+ if (name.startsWith("Is")) {
+ // \p{isBlockName}
+ node = unicodeBlockPropertyFor(name.substring(2));
+ }
+ else {
+ if (has(UNICODE_CHARACTER_CLASS)) {
+ UnicodeProp uprop = UnicodeProp.forPOSIXName(name);
+ if (uprop != null)
+ node = new Utype(uprop);
+ }
+ if (node == null)
+ node = charPropertyNodeFor(name);
+ }
+ //}
+ if (maybeComplement) {
+ if (node instanceof Category || node instanceof Block)
+ hasSupplementary = true;
+ node = node.complement();
+ }
+ return node;
+ }
+
+
+ /**
+ * Returns a CharProperty matching all characters belong to
+ * a UnicodeScript.
+ */
+ private CharProperty unicodeScriptPropertyFor(String name) {
+ final Character.UnicodeScript script;
+ try {
+ script = Character.UnicodeScript.forName(name);
+ } catch (IllegalArgumentException iae) {
+ throw error("Unknown character script name {" + name + "}");
+ }
+ return new Script(script);
+ }
+
+ /**
+ * Returns a CharProperty matching all characters in a UnicodeBlock.
+ */
+ private CharProperty unicodeBlockPropertyFor(String name) {
+ final Character.UnicodeBlock block;
+ try {
+ block = Character.UnicodeBlock.forName(name);
+ } catch (IllegalArgumentException iae) {
+ throw error("Unknown character block name {" + name + "}");
+ }
+ return new Block(block);
+ }
+
+ /**
+ * Returns a CharProperty matching all characters in a named property.
+ */
+ private CharProperty charPropertyNodeFor(String name) {
+ CharProperty p = CharPropertyNames.charPropertyFor(name);
+ if (p == null)
+ throw error("Unknown character property name {" + name + "}");
+ return p;
+ }
+
+ /**
+ * Parses and returns the name of a "named capturing group", the trailing
+ * ">" is consumed after parsing.
+ */
+ private String groupname(int ch) {
+ StringBuilder sb = new StringBuilder();
+ sb.append(Character.toChars(ch));
+ while (ASCII.isLower(ch=read()) || ASCII.isUpper(ch) ||
+ ASCII.isDigit(ch)) {
+ sb.append(Character.toChars(ch));
+ }
+ if (sb.length() == 0)
+ throw error("named capturing group has 0 length name");
+ if (ch != '>')
+ throw error("named capturing group is missing trailing '>'");
+ return sb.toString();
+ }
+
+ /**
+ * Parses a group and returns the head node of a set of nodes that process
+ * the group. Sometimes a double return system is used where the tail is
+ * returned in root.
+ */
+ private Node group0() {
+ boolean capturingGroup = false;
+ Node head = null;
+ Node tail = null;
+ int save = flags;
+ root = null;
+ int ch = next();
+ if (ch == '?') {
+ ch = skip();
+ switch (ch) {
+ case ':': // (?:xxx) pure group
+ head = createGroup(true);
+ tail = root;
+ head.next = expr(tail);
+ break;
+ case '=': // (?=xxx) and (?!xxx) lookahead
+ case '!':
+ head = createGroup(true);
+ tail = root;
+ head.next = expr(tail);
+ if (ch == '=') {
+ head = tail = new Pos(head);
+ } else {
+ head = tail = new Neg(head);
+ }
+ break;
+ case '>': // (?>xxx) independent group
+ head = createGroup(true);
+ tail = root;
+ head.next = expr(tail);
+ head = tail = new Ques(head, INDEPENDENT);
+ break;
+ case '<': // (?<xxx) look behind
+ ch = read();
+ if (ASCII.isLower(ch) || ASCII.isUpper(ch)) {
+ // named captured group
+ String name = groupname(ch);
+ if (namedGroups().containsKey(name))
+ throw error("Named capturing group <" + name
+ + "> is already defined");
+ capturingGroup = true;
+ head = createGroup(false);
+ tail = root;
+ namedGroups().put(name, capturingGroupCount-1);
+ head.next = expr(tail);
+ break;
+ }
+ int start = cursor;
+ head = createGroup(true);
+ tail = root;
+ head.next = expr(tail);
+ tail.next = lookbehindEnd;
+ TreeInfo info = new TreeInfo();
+ head.study(info);
+ if (info.maxValid == false) {
+ throw error("Look-behind group does not have "
+ + "an obvious maximum length");
+ }
+ boolean hasSupplementary = findSupplementary(start, patternLength);
+ if (ch == '=') {
+ head = tail = (hasSupplementary ?
+ new BehindS(head, info.maxLength,
+ info.minLength) :
+ new Behind(head, info.maxLength,
+ info.minLength));
+ } else if (ch == '!') {
+ head = tail = (hasSupplementary ?
+ new NotBehindS(head, info.maxLength,
+ info.minLength) :
+ new NotBehind(head, info.maxLength,
+ info.minLength));
+ } else {
+ throw error("Unknown look-behind group");
+ }
+ break;
+ case '$':
+ case '@':
+ throw error("Unknown group type");
+ default: // (?xxx:) inlined match flags
+ unread();
+ addFlag();
+ ch = read();
+ if (ch == ')') {
+ return null; // Inline modifier only
+ }
+ if (ch != ':') {
+ throw error("Unknown inline modifier");
+ }
+ head = createGroup(true);
+ tail = root;
+ head.next = expr(tail);
+ break;
+ }
+ } else { // (xxx) a regular group
+ capturingGroup = true;
+ head = createGroup(false);
+ tail = root;
+ head.next = expr(tail);
+ }
+
+ accept(')', "Unclosed group");
+ flags = save;
+
+ // Check for quantifiers
+ Node node = closure(head);
+ if (node == head) { // No closure
+ root = tail;
+ return node; // Dual return
+ }
+ if (head == tail) { // Zero length assertion
+ root = node;
+ return node; // Dual return
+ }
+
+ if (node instanceof Ques) {
+ Ques ques = (Ques) node;
+ if (ques.type == POSSESSIVE) {
+ root = node;
+ return node;
+ }
+ tail.next = new BranchConn();
+ tail = tail.next;
+ if (ques.type == GREEDY) {
+ head = new Branch(head, null, tail);
+ } else { // Reluctant quantifier
+ head = new Branch(null, head, tail);
+ }
+ root = tail;
+ return head;
+ } else if (node instanceof Curly) {
+ Curly curly = (Curly) node;
+ if (curly.type == POSSESSIVE) {
+ root = node;
+ return node;
+ }
+ // Discover if the group is deterministic
+ TreeInfo info = new TreeInfo();
+ if (head.study(info)) { // Deterministic
+ GroupTail temp = (GroupTail) tail;
+ head = root = new GroupCurly(head.next, curly.cmin,
+ curly.cmax, curly.type,
+ ((GroupTail)tail).localIndex,
+ ((GroupTail)tail).groupIndex,
+ capturingGroup);
+ return head;
+ } else { // Non-deterministic
+ int temp = ((GroupHead) head).localIndex;
+ Loop loop;
+ if (curly.type == GREEDY)
+ loop = new Loop(this.localCount, temp);
+ else // Reluctant Curly
+ loop = new LazyLoop(this.localCount, temp);
+ Prolog prolog = new Prolog(loop);
+ this.localCount += 1;
+ loop.cmin = curly.cmin;
+ loop.cmax = curly.cmax;
+ loop.body = head;
+ tail.next = loop;
+ root = loop;
+ return prolog; // Dual return
+ }
+ }
+ throw error("Internal logic error");
+ }
+
+ /**
+ * Create group head and tail nodes using double return. If the group is
+ * created with anonymous true then it is a pure group and should not
+ * affect group counting.
+ */
+ private Node createGroup(boolean anonymous) {
+ int localIndex = localCount++;
+ int groupIndex = 0;
+ if (!anonymous)
+ groupIndex = capturingGroupCount++;
+ GroupHead head = new GroupHead(localIndex);
+ root = new GroupTail(localIndex, groupIndex);
+ if (!anonymous && groupIndex < 10)
+ groupNodes[groupIndex] = head;
+ return head;
+ }
+
+ @SuppressWarnings("fallthrough")
+ /**
+ * Parses inlined match flags and set them appropriately.
+ */
+ private void addFlag() {
+ int ch = peek();
+ for (;;) {
+ switch (ch) {
+ case 'i':
+ flags |= CASE_INSENSITIVE;
+ break;
+ case 'm':
+ flags |= MULTILINE;
+ break;
+ case 's':
+ flags |= DOTALL;
+ break;
+ case 'd':
+ flags |= UNIX_LINES;
+ break;
+ case 'u':
+ flags |= UNICODE_CASE;
+ break;
+ case 'c':
+ flags |= CANON_EQ;
+ break;
+ case 'x':
+ flags |= IGNORE_WHITESPACE;
+ break;
+ case 'U':
+ flags |= (UNICODE_CHARACTER_CLASS | UNICODE_CASE);
+ break;
+ case '-': // subFlag then fall through
+ ch = next();
+ subFlag();
+ default:
+ return;
+ }
+ ch = next();
+ }
+ }
+
+ @SuppressWarnings("fallthrough")
+ /**
+ * Parses the second part of inlined match flags and turns off
+ * flags appropriately.
+ */
+ private void subFlag() {
+ int ch = peek();
+ for (;;) {
+ switch (ch) {
+ case 'i':
+ flags &= ~CASE_INSENSITIVE;
+ break;
+ case 'm':
+ flags &= ~MULTILINE;
+ break;
+ case 's':
+ flags &= ~DOTALL;
+ break;
+ case 'd':
+ flags &= ~UNIX_LINES;
+ break;
+ case 'u':
+ flags &= ~UNICODE_CASE;
+ break;
+ case 'c':
+ flags &= ~CANON_EQ;
+ break;
+ case 'x':
+ flags &= ~IGNORE_WHITESPACE;
+ break;
+ case 'U':
+ flags &= ~(UNICODE_CHARACTER_CLASS | UNICODE_CASE);
+ default:
+ return;
+ }
+ ch = next();
+ }
+ }
+
+ static final int MAX_REPS = 0x7FFFFFFF;
+
+ static final int GREEDY = 0;
+
+ static final int LAZY = 1;
+
+ static final int POSSESSIVE = 2;
+
+ static final int INDEPENDENT = 3;
+
+ /**
+ * Processes repetition. If the next character peeked is a quantifier
+ * then new nodes must be appended to handle the repetition.
+ * Prev could be a single or a group, so it could be a chain of nodes.
+ */
+ private Node closure(Node prev) {
+ Node atom;
+ int ch = peek();
+ switch (ch) {
+ case '?':
+ ch = next();
+ if (ch == '?') {
+ next();
+ return new Ques(prev, LAZY);
+ } else if (ch == '+') {
+ next();
+ return new Ques(prev, POSSESSIVE);
+ }
+ return new Ques(prev, GREEDY);
+ case '*':
+ ch = next();
+ if (ch == '?') {
+ next();
+ return new Curly(prev, 0, MAX_REPS, LAZY);
+ } else if (ch == '+') {
+ next();
+ return new Curly(prev, 0, MAX_REPS, POSSESSIVE);
+ }
+ return new Curly(prev, 0, MAX_REPS, GREEDY);
+ case '+':
+ ch = next();
+ if (ch == '?') {
+ next();
+ return new Curly(prev, 1, MAX_REPS, LAZY);
+ } else if (ch == '+') {
+ next();
+ return new Curly(prev, 1, MAX_REPS, POSSESSIVE);
+ }
+ return new Curly(prev, 1, MAX_REPS, GREEDY);
+ case '{':
+ ch = temp[cursor+1];
+ if (ASCII.isDigit(ch)) {
+ skip();
+ int cmin = 0;
+ do {
+ cmin = cmin * 10 + (ch - '0');
+ } while (ASCII.isDigit(ch = read()));
+ int cmax = cmin;
+ if (ch == ',') {
+ ch = read();
+ cmax = MAX_REPS;
+ if (ch != '}') {
+ cmax = 0;
+ while (ASCII.isDigit(ch)) {
+ cmax = cmax * 10 + (ch - '0');
+ ch = read();
+ }
+ }
+ }
+ if (ch != '}')
+ throw error("Unclosed counted closure");
+ if (((cmin) | (cmax) | (cmax - cmin)) < 0)
+ throw error("Illegal repetition range");
+ Curly curly;
+ ch = peek();
+ if (ch == '?') {
+ next();
+ curly = new Curly(prev, cmin, cmax, LAZY);
+ } else if (ch == '+') {
+ next();
+ curly = new Curly(prev, cmin, cmax, POSSESSIVE);
+ } else {
+ curly = new Curly(prev, cmin, cmax, GREEDY);
+ }
+ return curly;
+ } else {
+ throw error("Illegal repetition");
+ }
+ default:
+ return prev;
+ }
+ }
+
+ /**
+ * Utility method for parsing control escape sequences.
+ */
+ private int c() {
+ if (cursor < patternLength) {
+ return read() ^ 64;
+ }
+ throw error("Illegal control escape sequence");
+ }
+
+ /**
+ * Utility method for parsing octal escape sequences.
+ */
+ private int o() {
+ int n = read();
+ if (((n-'0')|('7'-n)) >= 0) {
+ int m = read();
+ if (((m-'0')|('7'-m)) >= 0) {
+ int o = read();
+ if ((((o-'0')|('7'-o)) >= 0) && (((n-'0')|('3'-n)) >= 0)) {
+ return (n - '0') * 64 + (m - '0') * 8 + (o - '0');
+ }
+ unread();
+ return (n - '0') * 8 + (m - '0');
+ }
+ unread();
+ return (n - '0');
+ }
+ throw error("Illegal octal escape sequence");
+ }
+
+ /**
+ * Utility method for parsing hexadecimal escape sequences.
+ */
+ private int x() {
+ int n = read();
+ if (ASCII.isHexDigit(n)) {
+ int m = read();
+ if (ASCII.isHexDigit(m)) {
+ return ASCII.toDigit(n) * 16 + ASCII.toDigit(m);
+ }
+ } else if (n == '{' && ASCII.isHexDigit(peek())) {
+ int ch = 0;
+ while (ASCII.isHexDigit(n = read())) {
+ ch = (ch << 4) + ASCII.toDigit(n);
+ if (ch > Character.MAX_CODE_POINT)
+ throw error("Hexadecimal codepoint is too big");
+ }
+ if (n != '}')
+ throw error("Unclosed hexadecimal escape sequence");
+ return ch;
+ }
+ throw error("Illegal hexadecimal escape sequence");
+ }
+
+ /**
+ * Utility method for parsing unicode escape sequences.
+ */
+ private int cursor() {
+ return cursor;
+ }
+
+ private void setcursor(int pos) {
+ cursor = pos;
+ }
+
+ private int uxxxx() {
+ int n = 0;
+ for (int i = 0; i < 4; i++) {
+ int ch = read();
+ if (!ASCII.isHexDigit(ch)) {
+ throw error("Illegal Unicode escape sequence");
+ }
+ n = n * 16 + ASCII.toDigit(ch);
+ }
+ return n;
+ }
+
+ private int u() {
+ int n = uxxxx();
+ if (Character.isHighSurrogate((char)n)) {
+ int cur = cursor();
+ if (read() == '\\' && read() == 'u') {
+ int n2 = uxxxx();
+ if (Character.isLowSurrogate((char)n2))
+ return Character.toCodePoint((char)n, (char)n2);
+ }
+ setcursor(cur);
+ }
+ return n;
+ }
+
+ //
+ // Utility methods for code point support
+ //
+
+ private static final int countChars(CharSequence seq, int index,
+ int lengthInCodePoints) {
+ // optimization
+ if (lengthInCodePoints == 1 && !Character.isHighSurrogate(seq.charAt(index))) {
+ assert (index >= 0 && index < seq.length());
+ return 1;
+ }
+ int length = seq.length();
+ int x = index;
+ if (lengthInCodePoints >= 0) {
+ assert (index >= 0 && index < length);
+ for (int i = 0; x < length && i < lengthInCodePoints; i++) {
+ if (Character.isHighSurrogate(seq.charAt(x++))) {
+ if (x < length && Character.isLowSurrogate(seq.charAt(x))) {
+ x++;
+ }
+ }
+ }
+ return x - index;
+ }
+
+ assert (index >= 0 && index <= length);
+ if (index == 0) {
+ return 0;
+ }
+ int len = -lengthInCodePoints;
+ for (int i = 0; x > 0 && i < len; i++) {
+ if (Character.isLowSurrogate(seq.charAt(--x))) {
+ if (x > 0 && Character.isHighSurrogate(seq.charAt(x-1))) {
+ x--;
+ }
+ }
+ }
+ return index - x;
+ }
+
+ private static final int countCodePoints(CharSequence seq) {
+ int length = seq.length();
+ int n = 0;
+ for (int i = 0; i < length; ) {
+ n++;
+ if (Character.isHighSurrogate(seq.charAt(i++))) {
+ if (i < length && Character.isLowSurrogate(seq.charAt(i))) {
+ i++;
+ }
+ }
+ }
+ return n;
+ }
+
+ /**
+ * Creates a bit vector for matching Latin-1 values. A normal BitClass
+ * never matches values above Latin-1, and a complemented BitClass always
+ * matches values above Latin-1.
+ */
+ private static final class BitClass extends BmpCharProperty {
+ final boolean[] bits;
+ BitClass() { bits = new boolean[256]; }
+ private BitClass(boolean[] bits) { this.bits = bits; }
+ BitClass add(int c, int flags) {
+ assert c >= 0 && c <= 255;
+ if ((flags & CASE_INSENSITIVE) != 0) {
+ if (ASCII.isAscii(c)) {
+ bits[ASCII.toUpper(c)] = true;
+ bits[ASCII.toLower(c)] = true;
+ } else if ((flags & UNICODE_CASE) != 0) {
+ bits[Character.toLowerCase(c)] = true;
+ bits[Character.toUpperCase(c)] = true;
+ }
+ }
+ bits[c] = true;
+ return this;
+ }
+ boolean isSatisfiedBy(int ch) {
+ return ch < 256 && bits[ch];
+ }
+ }
+
+ /**
+ * Returns a suitably optimized, single character matcher.
+ */
+ private CharProperty newSingle(final int ch) {
+ if (has(CASE_INSENSITIVE)) {
+ int lower, upper;
+ if (has(UNICODE_CASE)) {
+ upper = Character.toUpperCase(ch);
+ lower = Character.toLowerCase(upper);
+ if (upper != lower)
+ return new SingleU(lower);
+ } else if (ASCII.isAscii(ch)) {
+ lower = ASCII.toLower(ch);
+ upper = ASCII.toUpper(ch);
+ if (lower != upper)
+ return new SingleI(lower, upper);
+ }
+ }
+ if (isSupplementary(ch))
+ return new SingleS(ch); // Match a given Unicode character
+ return new Single(ch); // Match a given BMP character
+ }
+
+ /**
+ * Utility method for creating a string slice matcher.
+ */
+ private Node newSlice(int[] buf, int count, boolean hasSupplementary) {
+ int[] tmp = new int[count];
+ if (has(CASE_INSENSITIVE)) {
+ if (has(UNICODE_CASE)) {
+ for (int i = 0; i < count; i++) {
+ tmp[i] = Character.toLowerCase(
+ Character.toUpperCase(buf[i]));
+ }
+ return hasSupplementary? new SliceUS(tmp) : new SliceU(tmp);
+ }
+ for (int i = 0; i < count; i++) {
+ tmp[i] = ASCII.toLower(buf[i]);
+ }
+ return hasSupplementary? new SliceIS(tmp) : new SliceI(tmp);
+ }
+ for (int i = 0; i < count; i++) {
+ tmp[i] = buf[i];
+ }
+ return hasSupplementary ? new SliceS(tmp) : new Slice(tmp);
+ }
+
+ /**
+ * The following classes are the building components of the object
+ * tree that represents a compiled regular expression. The object tree
+ * is made of individual elements that handle constructs in the Pattern.
+ * Each type of object knows how to match its equivalent construct with
+ * the match() method.
+ */
+
+ /**
+ * Base class for all node classes. Subclasses should override the match()
+ * method as appropriate. This class is an accepting node, so its match()
+ * always returns true.
+ */
+ static class Node extends Object {
+ Node next;
+ Node() {
+ next = Pattern.accept;
+ }
+ /**
+ * This method implements the classic accept node.
+ */
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ matcher.last = i;
+ matcher.groups[0] = matcher.first;
+ matcher.groups[1] = matcher.last;
+ return true;
+ }
+ /**
+ * This method is good for all zero length assertions.
+ */
+ boolean study(TreeInfo info) {
+ if (next != null) {
+ return next.study(info);
+ } else {
+ return info.deterministic;
+ }
+ }
+ }
+
+ static class LastNode extends Node {
+ /**
+ * This method implements the classic accept node with
+ * the addition of a check to see if the match occurred
+ * using all of the input.
+ */
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ if (matcher.acceptMode == Matcher.ENDANCHOR && i != matcher.to)
+ return false;
+ matcher.last = i;
+ matcher.groups[0] = matcher.first;
+ matcher.groups[1] = matcher.last;
+ return true;
+ }
+ }
+
+ /**
+ * Used for REs that can start anywhere within the input string.
+ * This basically tries to match repeatedly at each spot in the
+ * input string, moving forward after each try. An anchored search
+ * or a BnM will bypass this node completely.
+ */
+ static class Start extends Node {
+ int minLength;
+ Start(Node node) {
+ this.next = node;
+ TreeInfo info = new TreeInfo();
+ next.study(info);
+ minLength = info.minLength;
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ if (i > matcher.to - minLength) {
+ matcher.hitEnd = true;
+ return false;
+ }
+ int guard = matcher.to - minLength;
+ for (; i <= guard; i++) {
+ if (next.match(matcher, i, seq)) {
+ matcher.first = i;
+ matcher.groups[0] = matcher.first;
+ matcher.groups[1] = matcher.last;
+ return true;
+ }
+ }
+ matcher.hitEnd = true;
+ return false;
+ }
+ boolean study(TreeInfo info) {
+ next.study(info);
+ info.maxValid = false;
+ info.deterministic = false;
+ return false;
+ }
+ }
+
+ /*
+ * StartS supports supplementary characters, including unpaired surrogates.
+ */
+ static final class StartS extends Start {
+ StartS(Node node) {
+ super(node);
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ if (i > matcher.to - minLength) {
+ matcher.hitEnd = true;
+ return false;
+ }
+ int guard = matcher.to - minLength;
+ while (i <= guard) {
+ //if ((ret = next.match(matcher, i, seq)) || i == guard)
+ if (next.match(matcher, i, seq)) {
+ matcher.first = i;
+ matcher.groups[0] = matcher.first;
+ matcher.groups[1] = matcher.last;
+ return true;
+ }
+ if (i == guard)
+ break;
+ // Optimization to move to the next character. This is
+ // faster than countChars(seq, i, 1).
+ if (Character.isHighSurrogate(seq.charAt(i++))) {
+ if (i < seq.length() &&
+ Character.isLowSurrogate(seq.charAt(i))) {
+ i++;
+ }
+ }
+ }
+ matcher.hitEnd = true;
+ return false;
+ }
+ }
+
+ /**
+ * Node to anchor at the beginning of input. This object implements the
+ * match for a \A sequence, and the caret anchor will use this if not in
+ * multiline mode.
+ */
+ static final class Begin extends Node {
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int fromIndex = (matcher.anchoringBounds) ?
+ matcher.from : 0;
+ if (i == fromIndex && next.match(matcher, i, seq)) {
+ matcher.first = i;
+ matcher.groups[0] = i;
+ matcher.groups[1] = matcher.last;
+ return true;
+ } else {
+ return false;
+ }
+ }
+ }
+
+ /**
+ * Node to anchor at the end of input. This is the absolute end, so this
+ * should not match at the last newline before the end as $ will.
+ */
+ static final class End extends Node {
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int endIndex = (matcher.anchoringBounds) ?
+ matcher.to : matcher.getTextLength();
+ if (i == endIndex) {
+ matcher.hitEnd = true;
+ return next.match(matcher, i, seq);
+ }
+ return false;
+ }
+ }
+
+ /**
+ * Node to anchor at the beginning of a line. This is essentially the
+ * object to match for the multiline ^.
+ */
+ static final class Caret extends Node {
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int startIndex = matcher.from;
+ int endIndex = matcher.to;
+ if (!matcher.anchoringBounds) {
+ startIndex = 0;
+ endIndex = matcher.getTextLength();
+ }
+ // Perl does not match ^ at end of input even after newline
+ if (i == endIndex) {
+ matcher.hitEnd = true;
+ return false;
+ }
+ if (i > startIndex) {
+ char ch = seq.charAt(i-1);
+ if (ch != '\n' && ch != '\r'
+ && (ch|1) != '\u2029'
+ && ch != '\u0085' ) {
+ return false;
+ }
+ // Should treat /r/n as one newline
+ if (ch == '\r' && seq.charAt(i) == '\n')
+ return false;
+ }
+ return next.match(matcher, i, seq);
+ }
+ }
+
+ /**
+ * Node to anchor at the beginning of a line when in unixdot mode.
+ */
+ static final class UnixCaret extends Node {
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int startIndex = matcher.from;
+ int endIndex = matcher.to;
+ if (!matcher.anchoringBounds) {
+ startIndex = 0;
+ endIndex = matcher.getTextLength();
+ }
+ // Perl does not match ^ at end of input even after newline
+ if (i == endIndex) {
+ matcher.hitEnd = true;
+ return false;
+ }
+ if (i > startIndex) {
+ char ch = seq.charAt(i-1);
+ if (ch != '\n') {
+ return false;
+ }
+ }
+ return next.match(matcher, i, seq);
+ }
+ }
+
+ /**
+ * Node to match the location where the last match ended.
+ * This is used for the \G construct.
+ */
+ static final class LastMatch extends Node {
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ if (i != matcher.oldLast)
+ return false;
+ return next.match(matcher, i, seq);
+ }
+ }
+
+ /**
+ * Node to anchor at the end of a line or the end of input based on the
+ * multiline mode.
+ *
+ * When not in multiline mode, the $ can only match at the very end
+ * of the input, unless the input ends in a line terminator in which
+ * it matches right before the last line terminator.
+ *
+ * Note that \r\n is considered an atomic line terminator.
+ *
+ * Like ^ the $ operator matches at a position, it does not match the
+ * line terminators themselves.
+ */
+ static final class Dollar extends Node {
+ boolean multiline;
+ Dollar(boolean mul) {
+ multiline = mul;
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int endIndex = (matcher.anchoringBounds) ?
+ matcher.to : matcher.getTextLength();
+ if (!multiline) {
+ if (i < endIndex - 2)
+ return false;
+ if (i == endIndex - 2) {
+ char ch = seq.charAt(i);
+ if (ch != '\r')
+ return false;
+ ch = seq.charAt(i + 1);
+ if (ch != '\n')
+ return false;
+ }
+ }
+ // Matches before any line terminator; also matches at the
+ // end of input
+ // Before line terminator:
+ // If multiline, we match here no matter what
+ // If not multiline, fall through so that the end
+ // is marked as hit; this must be a /r/n or a /n
+ // at the very end so the end was hit; more input
+ // could make this not match here
+ if (i < endIndex) {
+ char ch = seq.charAt(i);
+ if (ch == '\n') {
+ // No match between \r\n
+ if (i > 0 && seq.charAt(i-1) == '\r')
+ return false;
+ if (multiline)
+ return next.match(matcher, i, seq);
+ } else if (ch == '\r' || ch == '\u0085' ||
+ (ch|1) == '\u2029') {
+ if (multiline)
+ return next.match(matcher, i, seq);
+ } else { // No line terminator, no match
+ return false;
+ }
+ }
+ // Matched at current end so hit end
+ matcher.hitEnd = true;
+ // If a $ matches because of end of input, then more input
+ // could cause it to fail!
+ matcher.requireEnd = true;
+ return next.match(matcher, i, seq);
+ }
+ boolean study(TreeInfo info) {
+ next.study(info);
+ return info.deterministic;
+ }
+ }
+
+ /**
+ * Node to anchor at the end of a line or the end of input based on the
+ * multiline mode when in unix lines mode.
+ */
+ static final class UnixDollar extends Node {
+ boolean multiline;
+ UnixDollar(boolean mul) {
+ multiline = mul;
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int endIndex = (matcher.anchoringBounds) ?
+ matcher.to : matcher.getTextLength();
+ if (i < endIndex) {
+ char ch = seq.charAt(i);
+ if (ch == '\n') {
+ // If not multiline, then only possible to
+ // match at very end or one before end
+ if (multiline == false && i != endIndex - 1)
+ return false;
+ // If multiline return next.match without setting
+ // matcher.hitEnd
+ if (multiline)
+ return next.match(matcher, i, seq);
+ } else {
+ return false;
+ }
+ }
+ // Matching because at the end or 1 before the end;
+ // more input could change this so set hitEnd
+ matcher.hitEnd = true;
+ // If a $ matches because of end of input, then more input
+ // could cause it to fail!
+ matcher.requireEnd = true;
+ return next.match(matcher, i, seq);
+ }
+ boolean study(TreeInfo info) {
+ next.study(info);
+ return info.deterministic;
+ }
+ }
+
+ /**
+ * Node class that matches a Unicode line ending '\R'
+ */
+ static final class LineEnding extends Node {
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ // (u+000Du+000A|[u+000Au+000Bu+000Cu+000Du+0085u+2028u+2029])
+ if (i < matcher.to) {
+ int ch = seq.charAt(i);
+ if (ch == 0x0A || ch == 0x0B || ch == 0x0C ||
+ ch == 0x85 || ch == 0x2028 || ch == 0x2029)
+ return next.match(matcher, i + 1, seq);
+ if (ch == 0x0D) {
+ i++;
+ if (i < matcher.to && seq.charAt(i) == 0x0A)
+ i++;
+ return next.match(matcher, i, seq);
+ }
+ } else {
+ matcher.hitEnd = true;
+ }
+ return false;
+ }
+ boolean study(TreeInfo info) {
+ info.minLength++;
+ info.maxLength += 2;
+ return next.study(info);
+ }
+ }
+
+ /**
+ * Abstract node class to match one character satisfying some
+ * boolean property.
+ */
+ private static abstract class CharProperty extends Node {
+ abstract boolean isSatisfiedBy(int ch);
+ CharProperty complement() {
+ return new CharProperty() {
+ boolean isSatisfiedBy(int ch) {
+ return ! CharProperty.this.isSatisfiedBy(ch);}};
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ if (i < matcher.to) {
+ int ch = Character.codePointAt(seq, i);
+ return isSatisfiedBy(ch)
+ && next.match(matcher, i+Character.charCount(ch), seq);
+ } else {
+ matcher.hitEnd = true;
+ return false;
+ }
+ }
+ boolean study(TreeInfo info) {
+ info.minLength++;
+ info.maxLength++;
+ return next.study(info);
+ }
+ }
+
+ /**
+ * Optimized version of CharProperty that works only for
+ * properties never satisfied by Supplementary characters.
+ */
+ private static abstract class BmpCharProperty extends CharProperty {
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ if (i < matcher.to) {
+ return isSatisfiedBy(seq.charAt(i))
+ && next.match(matcher, i+1, seq);
+ } else {
+ matcher.hitEnd = true;
+ return false;
+ }
+ }
+ }
+
+ /**
+ * Node class that matches a Supplementary Unicode character
+ */
+ static final class SingleS extends CharProperty {
+ final int c;
+ SingleS(int c) { this.c = c; }
+ boolean isSatisfiedBy(int ch) {
+ return ch == c;
+ }
+ }
+
+ /**
+ * Optimization -- matches a given BMP character
+ */
+ static final class Single extends BmpCharProperty {
+ final int c;
+ Single(int c) { this.c = c; }
+ boolean isSatisfiedBy(int ch) {
+ return ch == c;
+ }
+ }
+
+ /**
+ * Case insensitive matches a given BMP character
+ */
+ static final class SingleI extends BmpCharProperty {
+ final int lower;
+ final int upper;
+ SingleI(int lower, int upper) {
+ this.lower = lower;
+ this.upper = upper;
+ }
+ boolean isSatisfiedBy(int ch) {
+ return ch == lower || ch == upper;
+ }
+ }
+
+ /**
+ * Unicode case insensitive matches a given Unicode character
+ */
+ static final class SingleU extends CharProperty {
+ final int lower;
+ SingleU(int lower) {
+ this.lower = lower;
+ }
+ boolean isSatisfiedBy(int ch) {
+ return lower == ch ||
+ lower == Character.toLowerCase(Character.toUpperCase(ch));
+ }
+ }
+
+ /**
+ * Node class that matches a Unicode block.
+ */
+ static final class Block extends CharProperty {
+ final Character.UnicodeBlock block;
+ Block(Character.UnicodeBlock block) {
+ this.block = block;
+ }
+ boolean isSatisfiedBy(int ch) {
+ return block == Character.UnicodeBlock.of(ch);
+ }
+ }
+
+ /**
+ * Node class that matches a Unicode script
+ */
+ static final class Script extends CharProperty {
+ final Character.UnicodeScript script;
+ Script(Character.UnicodeScript script) {
+ this.script = script;
+ }
+ boolean isSatisfiedBy(int ch) {
+ return script == Character.UnicodeScript.of(ch);
+ }
+ }
+
+ /**
+ * Node class that matches a Unicode category.
+ */
+ static final class Category extends CharProperty {
+ final int typeMask;
+ Category(int typeMask) { this.typeMask = typeMask; }
+ boolean isSatisfiedBy(int ch) {
+ return (typeMask & (1 << Character.getType(ch))) != 0;
+ }
+ }
+
+ /**
+ * Node class that matches a Unicode "type"
+ */
+ static final class Utype extends CharProperty {
+ final UnicodeProp uprop;
+ Utype(UnicodeProp uprop) { this.uprop = uprop; }
+ boolean isSatisfiedBy(int ch) {
+ return uprop.is(ch);
+ }
+ }
+
+ /**
+ * Node class that matches a POSIX type.
+ */
+ static final class Ctype extends BmpCharProperty {
+ final int ctype;
+ Ctype(int ctype) { this.ctype = ctype; }
+ boolean isSatisfiedBy(int ch) {
+ return ch < 128 && ASCII.isType(ch, ctype);
+ }
+ }
+
+ /**
+ * Node class that matches a Perl vertical whitespace
+ */
+ static final class VertWS extends BmpCharProperty {
+ boolean isSatisfiedBy(int cp) {
+ return (cp >= 0x0A && cp <= 0x0D) ||
+ cp == 0x85 || cp == 0x2028 || cp == 0x2029;
+ }
+ }
+
+ /**
+ * Node class that matches a Perl horizontal whitespace
+ */
+ static final class HorizWS extends BmpCharProperty {
+ boolean isSatisfiedBy(int cp) {
+ return cp == 0x09 || cp == 0x20 || cp == 0xa0 ||
+ cp == 0x1680 || cp == 0x180e ||
+ cp >= 0x2000 && cp <= 0x200a ||
+ cp == 0x202f || cp == 0x205f || cp == 0x3000;
+ }
+ }
+
+ /**
+ * Base class for all Slice nodes
+ */
+ static class SliceNode extends Node {
+ int[] buffer;
+ SliceNode(int[] buf) {
+ buffer = buf;
+ }
+ boolean study(TreeInfo info) {
+ info.minLength += buffer.length;
+ info.maxLength += buffer.length;
+ return next.study(info);
+ }
+ }
+
+ /**
+ * Node class for a case sensitive/BMP-only sequence of literal
+ * characters.
+ */
+ static final class Slice extends SliceNode {
+ Slice(int[] buf) {
+ super(buf);
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int[] buf = buffer;
+ int len = buf.length;
+ for (int j=0; j<len; j++) {
+ if ((i+j) >= matcher.to) {
+ matcher.hitEnd = true;
+ return false;
+ }
+ if (buf[j] != seq.charAt(i+j))
+ return false;
+ }
+ return next.match(matcher, i+len, seq);
+ }
+ }
+
+ /**
+ * Node class for a case_insensitive/BMP-only sequence of literal
+ * characters.
+ */
+ static class SliceI extends SliceNode {
+ SliceI(int[] buf) {
+ super(buf);
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int[] buf = buffer;
+ int len = buf.length;
+ for (int j=0; j<len; j++) {
+ if ((i+j) >= matcher.to) {
+ matcher.hitEnd = true;
+ return false;
+ }
+ int c = seq.charAt(i+j);
+ if (buf[j] != c &&
+ buf[j] != ASCII.toLower(c))
+ return false;
+ }
+ return next.match(matcher, i+len, seq);
+ }
+ }
+
+ /**
+ * Node class for a unicode_case_insensitive/BMP-only sequence of
+ * literal characters. Uses unicode case folding.
+ */
+ static final class SliceU extends SliceNode {
+ SliceU(int[] buf) {
+ super(buf);
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int[] buf = buffer;
+ int len = buf.length;
+ for (int j=0; j<len; j++) {
+ if ((i+j) >= matcher.to) {
+ matcher.hitEnd = true;
+ return false;
+ }
+ int c = seq.charAt(i+j);
+ if (buf[j] != c &&
+ buf[j] != Character.toLowerCase(Character.toUpperCase(c)))
+ return false;
+ }
+ return next.match(matcher, i+len, seq);
+ }
+ }
+
+ /**
+ * Node class for a case sensitive sequence of literal characters
+ * including supplementary characters.
+ */
+ static final class SliceS extends SliceNode {
+ SliceS(int[] buf) {
+ super(buf);
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int[] buf = buffer;
+ int x = i;
+ for (int j = 0; j < buf.length; j++) {
+ if (x >= matcher.to) {
+ matcher.hitEnd = true;
+ return false;
+ }
+ int c = Character.codePointAt(seq, x);
+ if (buf[j] != c)
+ return false;
+ x += Character.charCount(c);
+ if (x > matcher.to) {
+ matcher.hitEnd = true;
+ return false;
+ }
+ }
+ return next.match(matcher, x, seq);
+ }
+ }
+
+ /**
+ * Node class for a case insensitive sequence of literal characters
+ * including supplementary characters.
+ */
+ static class SliceIS extends SliceNode {
+ SliceIS(int[] buf) {
+ super(buf);
+ }
+ int toLower(int c) {
+ return ASCII.toLower(c);
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int[] buf = buffer;
+ int x = i;
+ for (int j = 0; j < buf.length; j++) {
+ if (x >= matcher.to) {
+ matcher.hitEnd = true;
+ return false;
+ }
+ int c = Character.codePointAt(seq, x);
+ if (buf[j] != c && buf[j] != toLower(c))
+ return false;
+ x += Character.charCount(c);
+ if (x > matcher.to) {
+ matcher.hitEnd = true;
+ return false;
+ }
+ }
+ return next.match(matcher, x, seq);
+ }
+ }
+
+ /**
+ * Node class for a case insensitive sequence of literal characters.
+ * Uses unicode case folding.
+ */
+ static final class SliceUS extends SliceIS {
+ SliceUS(int[] buf) {
+ super(buf);
+ }
+ int toLower(int c) {
+ return Character.toLowerCase(Character.toUpperCase(c));
+ }
+ }
+
+ private static boolean inRange(int lower, int ch, int upper) {
+ return lower <= ch && ch <= upper;
+ }
+
+ /**
+ * Returns node for matching characters within an explicit value range.
+ */
+ private static CharProperty rangeFor(final int lower,
+ final int upper) {
+ return new CharProperty() {
+ boolean isSatisfiedBy(int ch) {
+ return inRange(lower, ch, upper);}};
+ }
+
+ /**
+ * Returns node for matching characters within an explicit value
+ * range in a case insensitive manner.
+ */
+ private CharProperty caseInsensitiveRangeFor(final int lower,
+ final int upper) {
+ if (has(UNICODE_CASE))
+ return new CharProperty() {
+ boolean isSatisfiedBy(int ch) {
+ if (inRange(lower, ch, upper))
+ return true;
+ int up = Character.toUpperCase(ch);
+ return inRange(lower, up, upper) ||
+ inRange(lower, Character.toLowerCase(up), upper);}};
+ return new CharProperty() {
+ boolean isSatisfiedBy(int ch) {
+ return inRange(lower, ch, upper) ||
+ ASCII.isAscii(ch) &&
+ (inRange(lower, ASCII.toUpper(ch), upper) ||
+ inRange(lower, ASCII.toLower(ch), upper));
+ }};
+ }
+
+ /**
+ * Implements the Unicode category ALL and the dot metacharacter when
+ * in dotall mode.
+ */
+ static final class All extends CharProperty {
+ boolean isSatisfiedBy(int ch) {
+ return true;
+ }
+ }
+
+ /**
+ * Node class for the dot metacharacter when dotall is not enabled.
+ */
+ static final class Dot extends CharProperty {
+ boolean isSatisfiedBy(int ch) {
+ return (ch != '\n' && ch != '\r'
+ && (ch|1) != '\u2029'
+ && ch != '\u0085');
+ }
+ }
+
+ /**
+ * Node class for the dot metacharacter when dotall is not enabled
+ * but UNIX_LINES is enabled.
+ */
+ static final class UnixDot extends CharProperty {
+ boolean isSatisfiedBy(int ch) {
+ return ch != '\n';
+ }
+ }
+
+ // changes for XPath 3.1 regex compliance
+ static final class XPath2Dot extends CharProperty {
+ boolean isSatisfiedBy(int ch) {
+ return ch != '\n' && ch != '\r';
+ }
+ }
+
+ // changes for XPath 3.1 regex compliance
+ static final class XPath2Whitespace extends CharProperty {
+ boolean isSatisfiedBy(int ch) {
+ return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r';
+ }
+ }
+
+ // changes for XPath 3.1 regex compliance
+ static final class XMLNameStartChar extends CharProperty {
+ boolean isSatisfiedBy(int ch) {
+ return XML11Char.isXML11NameStart(ch);
+ }
+ }
+
+ // changes for XPath 3.1 regex compliance
+ static final class XMLNameChar extends CharProperty {
+ boolean isSatisfiedBy(int ch) {
+ return XML11Char.isXML11Name(ch);
+ }
+ }
+
+ /**
+ * The 0 or 1 quantifier. This one class implements all three types.
+ */
+ static final class Ques extends Node {
+ Node atom;
+ int type;
+ Ques(Node node, int type) {
+ this.atom = node;
+ this.type = type;
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ switch (type) {
+ case GREEDY:
+ return (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq))
+ || next.match(matcher, i, seq);
+ case LAZY:
+ return next.match(matcher, i, seq)
+ || (atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq));
+ case POSSESSIVE:
+ if (atom.match(matcher, i, seq)) i = matcher.last;
+ return next.match(matcher, i, seq);
+ default:
+ return atom.match(matcher, i, seq) && next.match(matcher, matcher.last, seq);
+ }
+ }
+ boolean study(TreeInfo info) {
+ if (type != INDEPENDENT) {
+ int minL = info.minLength;
+ atom.study(info);
+ info.minLength = minL;
+ info.deterministic = false;
+ return next.study(info);
+ } else {
+ atom.study(info);
+ return next.study(info);
+ }
+ }
+ }
+
+ /**
+ * Handles the curly-brace style repetition with a specified minimum and
+ * maximum occurrences. The * quantifier is handled as a special case.
+ * This class handles the three types.
+ */
+ static final class Curly extends Node {
+ Node atom;
+ int type;
+ int cmin;
+ int cmax;
+
+ Curly(Node node, int cmin, int cmax, int type) {
+ this.atom = node;
+ this.type = type;
+ this.cmin = cmin;
+ this.cmax = cmax;
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int j;
+ for (j = 0; j < cmin; j++) {
+ if (atom.match(matcher, i, seq)) {
+ i = matcher.last;
+ continue;
+ }
+ return false;
+ }
+ if (type == GREEDY)
+ return match0(matcher, i, j, seq);
+ else if (type == LAZY)
+ return match1(matcher, i, j, seq);
+ else
+ return match2(matcher, i, j, seq);
+ }
+ // Greedy match.
+ // i is the index to start matching at
+ // j is the number of atoms that have matched
+ boolean match0(Matcher matcher, int i, int j, CharSequence seq) {
+ if (j >= cmax) {
+ // We have matched the maximum... continue with the rest of
+ // the regular expression
+ return next.match(matcher, i, seq);
+ }
+ int backLimit = j;
+ while (atom.match(matcher, i, seq)) {
+ // k is the length of this match
+ int k = matcher.last - i;
+ if (k == 0) // Zero length match
+ break;
+ // Move up index and number matched
+ i = matcher.last;
+ j++;
+ // We are greedy so match as many as we can
+ while (j < cmax) {
+ if (!atom.match(matcher, i, seq))
+ break;
+ if (i + k != matcher.last) {
+ if (match0(matcher, matcher.last, j+1, seq))
+ return true;
+ break;
+ }
+ i += k;
+ j++;
+ }
+ // Handle backing off if match fails
+ while (j >= backLimit) {
+ if (next.match(matcher, i, seq))
+ return true;
+ i -= k;
+ j--;
+ }
+ return false;
+ }
+ return next.match(matcher, i, seq);
+ }
+ // Reluctant match. At this point, the minimum has been satisfied.
+ // i is the index to start matching at
+ // j is the number of atoms that have matched
+ boolean match1(Matcher matcher, int i, int j, CharSequence seq) {
+ for (;;) {
+ // Try finishing match without consuming any more
+ if (next.match(matcher, i, seq))
+ return true;
+ // At the maximum, no match found
+ if (j >= cmax)
+ return false;
+ // Okay, must try one more atom
+ if (!atom.match(matcher, i, seq))
+ return false;
+ // If we haven't moved forward then must break out
+ if (i == matcher.last)
+ return false;
+ // Move up index and number matched
+ i = matcher.last;
+ j++;
+ }
+ }
+ boolean match2(Matcher matcher, int i, int j, CharSequence seq) {
+ for (; j < cmax; j++) {
+ if (!atom.match(matcher, i, seq))
+ break;
+ if (i == matcher.last)
+ break;
+ i = matcher.last;
+ }
+ return next.match(matcher, i, seq);
+ }
+ boolean study(TreeInfo info) {
+ // Save original info
+ int minL = info.minLength;
+ int maxL = info.maxLength;
+ boolean maxV = info.maxValid;
+ boolean detm = info.deterministic;
+ info.reset();
+
+ atom.study(info);
+
+ int temp = info.minLength * cmin + minL;
+ if (temp < minL) {
+ temp = 0xFFFFFFF; // arbitrary large number
+ }
+ info.minLength = temp;
+
+ if (maxV & info.maxValid) {
+ temp = info.maxLength * cmax + maxL;
+ info.maxLength = temp;
+ if (temp < maxL) {
+ info.maxValid = false;
+ }
+ } else {
+ info.maxValid = false;
+ }
+
+ if (info.deterministic && cmin == cmax)
+ info.deterministic = detm;
+ else
+ info.deterministic = false;
+ return next.study(info);
+ }
+ }
+
+ /**
+ * Handles the curly-brace style repetition with a specified minimum and
+ * maximum occurrences in deterministic cases. This is an iterative
+ * optimization over the Prolog and Loop system which would handle this
+ * in a recursive way. The * quantifier is handled as a special case.
+ * If capture is true then this class saves group settings and ensures
+ * that groups are unset when backing off of a group match.
+ */
+ static final class GroupCurly extends Node {
+ Node atom;
+ int type;
+ int cmin;
+ int cmax;
+ int localIndex;
+ int groupIndex;
+ boolean capture;
+
+ GroupCurly(Node node, int cmin, int cmax, int type, int local,
+ int group, boolean capture) {
+ this.atom = node;
+ this.type = type;
+ this.cmin = cmin;
+ this.cmax = cmax;
+ this.localIndex = local;
+ this.groupIndex = group;
+ this.capture = capture;
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int[] groups = matcher.groups;
+ int[] locals = matcher.locals;
+ int save0 = locals[localIndex];
+ int save1 = 0;
+ int save2 = 0;
+
+ if (capture) {
+ save1 = groups[groupIndex];
+ save2 = groups[groupIndex+1];
+ }
+
+ // Notify GroupTail there is no need to setup group info
+ // because it will be set here
+ locals[localIndex] = -1;
+
+ boolean ret = true;
+ for (int j = 0; j < cmin; j++) {
+ if (atom.match(matcher, i, seq)) {
+ if (capture) {
+ groups[groupIndex] = i;
+ groups[groupIndex+1] = matcher.last;
+ }
+ i = matcher.last;
+ } else {
+ ret = false;
+ break;
+ }
+ }
+ if (ret) {
+ if (type == GREEDY) {
+ ret = match0(matcher, i, cmin, seq);
+ } else if (type == LAZY) {
+ ret = match1(matcher, i, cmin, seq);
+ } else {
+ ret = match2(matcher, i, cmin, seq);
+ }
+ }
+ if (!ret) {
+ locals[localIndex] = save0;
+ if (capture) {
+ groups[groupIndex] = save1;
+ groups[groupIndex+1] = save2;
+ }
+ }
+ return ret;
+ }
+ // Aggressive group match
+ boolean match0(Matcher matcher, int i, int j, CharSequence seq) {
+ // don't back off passing the starting "j"
+ int min = j;
+ int[] groups = matcher.groups;
+ int save0 = 0;
+ int save1 = 0;
+ if (capture) {
+ save0 = groups[groupIndex];
+ save1 = groups[groupIndex+1];
+ }
+ for (;;) {
+ if (j >= cmax)
+ break;
+ if (!atom.match(matcher, i, seq))
+ break;
+ int k = matcher.last - i;
+ if (k <= 0) {
+ if (capture) {
+ groups[groupIndex] = i;
+ groups[groupIndex+1] = i + k;
+ }
+ i = i + k;
+ break;
+ }
+ for (;;) {
+ if (capture) {
+ groups[groupIndex] = i;
+ groups[groupIndex+1] = i + k;
+ }
+ i = i + k;
+ if (++j >= cmax)
+ break;
+ if (!atom.match(matcher, i, seq))
+ break;
+ if (i + k != matcher.last) {
+ if (match0(matcher, i, j, seq))
+ return true;
+ break;
+ }
+ }
+ while (j > min) {
+ if (next.match(matcher, i, seq)) {
+ if (capture) {
+ groups[groupIndex+1] = i;
+ groups[groupIndex] = i - k;
+ }
+ return true;
+ }
+ // backing off
+ i = i - k;
+ if (capture) {
+ groups[groupIndex+1] = i;
+ groups[groupIndex] = i - k;
+ }
+ j--;
+
+ }
+ break;
+ }
+ if (capture) {
+ groups[groupIndex] = save0;
+ groups[groupIndex+1] = save1;
+ }
+ return next.match(matcher, i, seq);
+ }
+ // Reluctant matching
+ boolean match1(Matcher matcher, int i, int j, CharSequence seq) {
+ for (;;) {
+ if (next.match(matcher, i, seq))
+ return true;
+ if (j >= cmax)
+ return false;
+ if (!atom.match(matcher, i, seq))
+ return false;
+ if (i == matcher.last)
+ return false;
+ if (capture) {
+ matcher.groups[groupIndex] = i;
+ matcher.groups[groupIndex+1] = matcher.last;
+ }
+ i = matcher.last;
+ j++;
+ }
+ }
+ // Possessive matching
+ boolean match2(Matcher matcher, int i, int j, CharSequence seq) {
+ for (; j < cmax; j++) {
+ if (!atom.match(matcher, i, seq)) {
+ break;
+ }
+ if (capture) {
+ matcher.groups[groupIndex] = i;
+ matcher.groups[groupIndex+1] = matcher.last;
+ }
+ if (i == matcher.last) {
+ break;
+ }
+ i = matcher.last;
+ }
+ return next.match(matcher, i, seq);
+ }
+ boolean study(TreeInfo info) {
+ // Save original info
+ int minL = info.minLength;
+ int maxL = info.maxLength;
+ boolean maxV = info.maxValid;
+ boolean detm = info.deterministic;
+ info.reset();
+
+ atom.study(info);
+
+ int temp = info.minLength * cmin + minL;
+ if (temp < minL) {
+ temp = 0xFFFFFFF; // Arbitrary large number
+ }
+ info.minLength = temp;
+
+ if (maxV & info.maxValid) {
+ temp = info.maxLength * cmax + maxL;
+ info.maxLength = temp;
+ if (temp < maxL) {
+ info.maxValid = false;
+ }
+ } else {
+ info.maxValid = false;
+ }
+
+ if (info.deterministic && cmin == cmax) {
+ info.deterministic = detm;
+ } else {
+ info.deterministic = false;
+ }
+ return next.study(info);
+ }
+ }
+
+ /**
+ * A Guard node at the end of each atom node in a Branch. It
+ * serves the purpose of chaining the "match" operation to
+ * "next" but not the "study", so we can collect the TreeInfo
+ * of each atom node without including the TreeInfo of the
+ * "next".
+ */
+ static final class BranchConn extends Node {
+ BranchConn() {};
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ return next.match(matcher, i, seq);
+ }
+ boolean study(TreeInfo info) {
+ return info.deterministic;
+ }
+ }
+
+ /**
+ * Handles the branching of alternations. Note this is also used for
+ * the ? quantifier to branch between the case where it matches once
+ * and where it does not occur.
+ */
+ static final class Branch extends Node {
+ Node[] atoms = new Node[2];
+ int size = 2;
+ Node conn;
+ Branch(Node first, Node second, Node branchConn) {
+ conn = branchConn;
+ atoms[0] = first;
+ atoms[1] = second;
+ }
+
+ void add(Node node) {
+ if (size >= atoms.length) {
+ Node[] tmp = new Node[atoms.length*2];
+ System.arraycopy(atoms, 0, tmp, 0, atoms.length);
+ atoms = tmp;
+ }
+ atoms[size++] = node;
+ }
+
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ for (int n = 0; n < size; n++) {
+ if (atoms[n] == null) {
+ if (conn.next.match(matcher, i, seq))
+ return true;
+ } else if (atoms[n].match(matcher, i, seq)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ boolean study(TreeInfo info) {
+ int minL = info.minLength;
+ int maxL = info.maxLength;
+ boolean maxV = info.maxValid;
+
+ int minL2 = Integer.MAX_VALUE; //arbitrary large enough num
+ int maxL2 = -1;
+ for (int n = 0; n < size; n++) {
+ info.reset();
+ if (atoms[n] != null)
+ atoms[n].study(info);
+ minL2 = Math.min(minL2, info.minLength);
+ maxL2 = Math.max(maxL2, info.maxLength);
+ maxV = (maxV & info.maxValid);
+ }
+
+ minL += minL2;
+ maxL += maxL2;
+
+ info.reset();
+ conn.next.study(info);
+
+ info.minLength += minL;
+ info.maxLength += maxL;
+ info.maxValid &= maxV;
+ info.deterministic = false;
+ return false;
+ }
+ }
+
+ /**
+ * The GroupHead saves the location where the group begins in the locals
+ * and restores them when the match is done.
+ *
+ * The matchRef is used when a reference to this group is accessed later
+ * in the expression. The locals will have a negative value in them to
+ * indicate that we do not want to unset the group if the reference
+ * doesn't match.
+ */
+ static final class GroupHead extends Node {
+ int localIndex;
+ GroupHead(int localCount) {
+ localIndex = localCount;
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int save = matcher.locals[localIndex];
+ matcher.locals[localIndex] = i;
+ boolean ret = next.match(matcher, i, seq);
+ matcher.locals[localIndex] = save;
+ return ret;
+ }
+ boolean matchRef(Matcher matcher, int i, CharSequence seq) {
+ int save = matcher.locals[localIndex];
+ matcher.locals[localIndex] = ~i; // HACK
+ boolean ret = next.match(matcher, i, seq);
+ matcher.locals[localIndex] = save;
+ return ret;
+ }
+ }
+
+ /**
+ * Recursive reference to a group in the regular expression. It calls
+ * matchRef because if the reference fails to match we would not unset
+ * the group.
+ */
+ static final class GroupRef extends Node {
+ GroupHead head;
+ GroupRef(GroupHead head) {
+ this.head = head;
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ return head.matchRef(matcher, i, seq)
+ && next.match(matcher, matcher.last, seq);
+ }
+ boolean study(TreeInfo info) {
+ info.maxValid = false;
+ info.deterministic = false;
+ return next.study(info);
+ }
+ }
+
+ /**
+ * The GroupTail handles the setting of group beginning and ending
+ * locations when groups are successfully matched. It must also be able to
+ * unset groups that have to be backed off of.
+ *
+ * The GroupTail node is also used when a previous group is referenced,
+ * and in that case no group information needs to be set.
+ */
+ static final class GroupTail extends Node {
+ int localIndex;
+ int groupIndex;
+ GroupTail(int localCount, int groupCount) {
+ localIndex = localCount;
+ groupIndex = groupCount + groupCount;
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int tmp = matcher.locals[localIndex];
+ if (tmp >= 0) { // This is the normal group case.
+ // Save the group so we can unset it if it
+ // backs off of a match.
+ int groupStart = matcher.groups[groupIndex];
+ int groupEnd = matcher.groups[groupIndex+1];
+
+ matcher.groups[groupIndex] = tmp;
+ matcher.groups[groupIndex+1] = i;
+ if (next.match(matcher, i, seq)) {
+ return true;
+ }
+ matcher.groups[groupIndex] = groupStart;
+ matcher.groups[groupIndex+1] = groupEnd;
+ return false;
+ } else {
+ // This is a group reference case. We don't need to save any
+ // group info because it isn't really a group.
+ matcher.last = i;
+ return true;
+ }
+ }
+ }
+
+ /**
+ * This sets up a loop to handle a recursive quantifier structure.
+ */
+ static final class Prolog extends Node {
+ Loop loop;
+ Prolog(Loop loop) {
+ this.loop = loop;
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ return loop.matchInit(matcher, i, seq);
+ }
+ boolean study(TreeInfo info) {
+ return loop.study(info);
+ }
+ }
+
+ /**
+ * Handles the repetition count for a greedy Curly. The matchInit
+ * is called from the Prolog to save the index of where the group
+ * beginning is stored. A zero length group check occurs in the
+ * normal match but is skipped in the matchInit.
+ */
+ static class Loop extends Node {
+ Node body;
+ int countIndex; // local count index in matcher locals
+ int beginIndex; // group beginning index
+ int cmin, cmax;
+ Loop(int countIndex, int beginIndex) {
+ this.countIndex = countIndex;
+ this.beginIndex = beginIndex;
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ // Avoid infinite loop in zero-length case.
+ if (i > matcher.locals[beginIndex]) {
+ int count = matcher.locals[countIndex];
+
+ // This block is for before we reach the minimum
+ // iterations required for the loop to match
+ if (count < cmin) {
+ matcher.locals[countIndex] = count + 1;
+ boolean b = body.match(matcher, i, seq);
+ // If match failed we must backtrack, so
+ // the loop count should NOT be incremented
+ if (!b)
+ matcher.locals[countIndex] = count;
+ // Return success or failure since we are under
+ // minimum
+ return b;
+ }
+ // This block is for after we have the minimum
+ // iterations required for the loop to match
+ if (count < cmax) {
+ matcher.locals[countIndex] = count + 1;
+ boolean b = body.match(matcher, i, seq);
+ // If match failed we must backtrack, so
+ // the loop count should NOT be incremented
+ if (!b)
+ matcher.locals[countIndex] = count;
+ else
+ return true;
+ }
+ }
+ return next.match(matcher, i, seq);
+ }
+ boolean matchInit(Matcher matcher, int i, CharSequence seq) {
+ int save = matcher.locals[countIndex];
+ boolean ret = false;
+ if (0 < cmin) {
+ matcher.locals[countIndex] = 1;
+ ret = body.match(matcher, i, seq);
+ } else if (0 < cmax) {
+ matcher.locals[countIndex] = 1;
+ ret = body.match(matcher, i, seq);
+ if (ret == false)
+ ret = next.match(matcher, i, seq);
+ } else {
+ ret = next.match(matcher, i, seq);
+ }
+ matcher.locals[countIndex] = save;
+ return ret;
+ }
+ boolean study(TreeInfo info) {
+ info.maxValid = false;
+ info.deterministic = false;
+ return false;
+ }
+ }
+
+ /**
+ * Handles the repetition count for a reluctant Curly. The matchInit
+ * is called from the Prolog to save the index of where the group
+ * beginning is stored. A zero length group check occurs in the
+ * normal match but is skipped in the matchInit.
+ */
+ static final class LazyLoop extends Loop {
+ LazyLoop(int countIndex, int beginIndex) {
+ super(countIndex, beginIndex);
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ // Check for zero length group
+ if (i > matcher.locals[beginIndex]) {
+ int count = matcher.locals[countIndex];
+ if (count < cmin) {
+ matcher.locals[countIndex] = count + 1;
+ boolean result = body.match(matcher, i, seq);
+ // If match failed we must backtrack, so
+ // the loop count should NOT be incremented
+ if (!result)
+ matcher.locals[countIndex] = count;
+ return result;
+ }
+ if (next.match(matcher, i, seq))
+ return true;
+ if (count < cmax) {
+ matcher.locals[countIndex] = count + 1;
+ boolean result = body.match(matcher, i, seq);
+ // If match failed we must backtrack, so
+ // the loop count should NOT be incremented
+ if (!result)
+ matcher.locals[countIndex] = count;
+ return result;
+ }
+ return false;
+ }
+ return next.match(matcher, i, seq);
+ }
+ boolean matchInit(Matcher matcher, int i, CharSequence seq) {
+ int save = matcher.locals[countIndex];
+ boolean ret = false;
+ if (0 < cmin) {
+ matcher.locals[countIndex] = 1;
+ ret = body.match(matcher, i, seq);
+ } else if (next.match(matcher, i, seq)) {
+ ret = true;
+ } else if (0 < cmax) {
+ matcher.locals[countIndex] = 1;
+ ret = body.match(matcher, i, seq);
+ }
+ matcher.locals[countIndex] = save;
+ return ret;
+ }
+ boolean study(TreeInfo info) {
+ info.maxValid = false;
+ info.deterministic = false;
+ return false;
+ }
+ }
+
+ /**
+ * Refers to a group in the regular expression. Attempts to match
+ * whatever the group referred to last matched.
+ */
+ static class BackRef extends Node {
+ int groupIndex;
+ BackRef(int groupCount) {
+ super();
+ groupIndex = groupCount + groupCount;
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int j = matcher.groups[groupIndex];
+ int k = matcher.groups[groupIndex+1];
+
+ int groupSize = k - j;
+ // If the referenced group didn't match, neither can this
+ if (j < 0)
+ return false;
+
+ // If there isn't enough input left no match
+ if (i + groupSize > matcher.to) {
+ matcher.hitEnd = true;
+ return false;
+ }
+ // Check each new char to make sure it matches what the group
+ // referenced matched last time around
+ for (int index=0; index<groupSize; index++)
+ if (seq.charAt(i+index) != seq.charAt(j+index))
+ return false;
+
+ return next.match(matcher, i+groupSize, seq);
+ }
+ boolean study(TreeInfo info) {
+ info.maxValid = false;
+ return next.study(info);
+ }
+ }
+
+ static class CIBackRef extends Node {
+ int groupIndex;
+ boolean doUnicodeCase;
+ CIBackRef(int groupCount, boolean doUnicodeCase) {
+ super();
+ groupIndex = groupCount + groupCount;
+ this.doUnicodeCase = doUnicodeCase;
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int j = matcher.groups[groupIndex];
+ int k = matcher.groups[groupIndex+1];
+
+ int groupSize = k - j;
+
+ // If the referenced group didn't match, neither can this
+ if (j < 0)
+ return false;
+
+ // If there isn't enough input left no match
+ if (i + groupSize > matcher.to) {
+ matcher.hitEnd = true;
+ return false;
+ }
+
+ // Check each new char to make sure it matches what the group
+ // referenced matched last time around
+ int x = i;
+ for (int index=0; index<groupSize; index++) {
+ int c1 = Character.codePointAt(seq, x);
+ int c2 = Character.codePointAt(seq, j);
+ if (c1 != c2) {
+ if (doUnicodeCase) {
+ int cc1 = Character.toUpperCase(c1);
+ int cc2 = Character.toUpperCase(c2);
+ if (cc1 != cc2 &&
+ Character.toLowerCase(cc1) !=
+ Character.toLowerCase(cc2))
+ return false;
+ } else {
+ if (ASCII.toLower(c1) != ASCII.toLower(c2))
+ return false;
+ }
+ }
+ x += Character.charCount(c1);
+ j += Character.charCount(c2);
+ }
+
+ return next.match(matcher, i+groupSize, seq);
+ }
+ boolean study(TreeInfo info) {
+ info.maxValid = false;
+ return next.study(info);
+ }
+ }
+
+ /**
+ * Searches until the next instance of its atom. This is useful for
+ * finding the atom efficiently without passing an instance of it
+ * (greedy problem) and without a lot of wasted search time (reluctant
+ * problem).
+ */
+ static final class First extends Node {
+ Node atom;
+ First(Node node) {
+ this.atom = BnM.optimize(node);
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ if (atom instanceof BnM) {
+ return atom.match(matcher, i, seq)
+ && next.match(matcher, matcher.last, seq);
+ }
+ for (;;) {
+ if (i > matcher.to) {
+ matcher.hitEnd = true;
+ return false;
+ }
+ if (atom.match(matcher, i, seq)) {
+ return next.match(matcher, matcher.last, seq);
+ }
+ i += countChars(seq, i, 1);
+ matcher.first++;
+ }
+ }
+ boolean study(TreeInfo info) {
+ atom.study(info);
+ info.maxValid = false;
+ info.deterministic = false;
+ return next.study(info);
+ }
+ }
+
+ static final class Conditional extends Node {
+ Node cond, yes, not;
+ Conditional(Node cond, Node yes, Node not) {
+ this.cond = cond;
+ this.yes = yes;
+ this.not = not;
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ if (cond.match(matcher, i, seq)) {
+ return yes.match(matcher, i, seq);
+ } else {
+ return not.match(matcher, i, seq);
+ }
+ }
+ boolean study(TreeInfo info) {
+ int minL = info.minLength;
+ int maxL = info.maxLength;
+ boolean maxV = info.maxValid;
+ info.reset();
+ yes.study(info);
+
+ int minL2 = info.minLength;
+ int maxL2 = info.maxLength;
+ boolean maxV2 = info.maxValid;
+ info.reset();
+ not.study(info);
+
+ info.minLength = minL + Math.min(minL2, info.minLength);
+ info.maxLength = maxL + Math.max(maxL2, info.maxLength);
+ info.maxValid = (maxV & maxV2 & info.maxValid);
+ info.deterministic = false;
+ return next.study(info);
+ }
+ }
+
+ /**
+ * Zero width positive lookahead.
+ */
+ static final class Pos extends Node {
+ Node cond;
+ Pos(Node cond) {
+ this.cond = cond;
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int savedTo = matcher.to;
+ boolean conditionMatched = false;
+
+ // Relax transparent region boundaries for lookahead
+ if (matcher.transparentBounds)
+ matcher.to = matcher.getTextLength();
+ try {
+ conditionMatched = cond.match(matcher, i, seq);
+ } finally {
+ // Reinstate region boundaries
+ matcher.to = savedTo;
+ }
+ return conditionMatched && next.match(matcher, i, seq);
+ }
+ }
+
+ /**
+ * Zero width negative lookahead.
+ */
+ static final class Neg extends Node {
+ Node cond;
+ Neg(Node cond) {
+ this.cond = cond;
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int savedTo = matcher.to;
+ boolean conditionMatched = false;
+
+ // Relax transparent region boundaries for lookahead
+ if (matcher.transparentBounds)
+ matcher.to = matcher.getTextLength();
+ try {
+ if (i < matcher.to) {
+ conditionMatched = !cond.match(matcher, i, seq);
+ } else {
+ // If a negative lookahead succeeds then more input
+ // could cause it to fail!
+ matcher.requireEnd = true;
+ conditionMatched = !cond.match(matcher, i, seq);
+ }
+ } finally {
+ // Reinstate region boundaries
+ matcher.to = savedTo;
+ }
+ return conditionMatched && next.match(matcher, i, seq);
+ }
+ }
+
+ /**
+ * For use with lookbehinds; matches the position where the lookbehind
+ * was encountered.
+ */
+ static Node lookbehindEnd = new Node() {
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ return i == matcher.lookbehindTo;
+ }
+ };
+
+ /**
+ * Zero width positive lookbehind.
+ */
+ static class Behind extends Node {
+ Node cond;
+ int rmax, rmin;
+ Behind(Node cond, int rmax, int rmin) {
+ this.cond = cond;
+ this.rmax = rmax;
+ this.rmin = rmin;
+ }
+
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int savedFrom = matcher.from;
+ boolean conditionMatched = false;
+ int startIndex = (!matcher.transparentBounds) ?
+ matcher.from : 0;
+ int from = Math.max(i - rmax, startIndex);
+ // Set end boundary
+ int savedLBT = matcher.lookbehindTo;
+ matcher.lookbehindTo = i;
+ // Relax transparent region boundaries for lookbehind
+ if (matcher.transparentBounds)
+ matcher.from = 0;
+ for (int j = i - rmin; !conditionMatched && j >= from; j--) {
+ conditionMatched = cond.match(matcher, j, seq);
+ }
+ matcher.from = savedFrom;
+ matcher.lookbehindTo = savedLBT;
+ return conditionMatched && next.match(matcher, i, seq);
+ }
+ }
+
+ /**
+ * Zero width positive lookbehind, including supplementary
+ * characters or unpaired surrogates.
+ */
+ static final class BehindS extends Behind {
+ BehindS(Node cond, int rmax, int rmin) {
+ super(cond, rmax, rmin);
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int rmaxChars = countChars(seq, i, -rmax);
+ int rminChars = countChars(seq, i, -rmin);
+ int savedFrom = matcher.from;
+ int startIndex = (!matcher.transparentBounds) ?
+ matcher.from : 0;
+ boolean conditionMatched = false;
+ int from = Math.max(i - rmaxChars, startIndex);
+ // Set end boundary
+ int savedLBT = matcher.lookbehindTo;
+ matcher.lookbehindTo = i;
+ // Relax transparent region boundaries for lookbehind
+ if (matcher.transparentBounds)
+ matcher.from = 0;
+
+ for (int j = i - rminChars;
+ !conditionMatched && j >= from;
+ j -= j>from ? countChars(seq, j, -1) : 1) {
+ conditionMatched = cond.match(matcher, j, seq);
+ }
+ matcher.from = savedFrom;
+ matcher.lookbehindTo = savedLBT;
+ return conditionMatched && next.match(matcher, i, seq);
+ }
+ }
+
+ /**
+ * Zero width negative lookbehind.
+ */
+ static class NotBehind extends Node {
+ Node cond;
+ int rmax, rmin;
+ NotBehind(Node cond, int rmax, int rmin) {
+ this.cond = cond;
+ this.rmax = rmax;
+ this.rmin = rmin;
+ }
+
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int savedLBT = matcher.lookbehindTo;
+ int savedFrom = matcher.from;
+ boolean conditionMatched = false;
+ int startIndex = (!matcher.transparentBounds) ?
+ matcher.from : 0;
+ int from = Math.max(i - rmax, startIndex);
+ matcher.lookbehindTo = i;
+ // Relax transparent region boundaries for lookbehind
+ if (matcher.transparentBounds)
+ matcher.from = 0;
+ for (int j = i - rmin; !conditionMatched && j >= from; j--) {
+ conditionMatched = cond.match(matcher, j, seq);
+ }
+ // Reinstate region boundaries
+ matcher.from = savedFrom;
+ matcher.lookbehindTo = savedLBT;
+ return !conditionMatched && next.match(matcher, i, seq);
+ }
+ }
+
+ /**
+ * Zero width negative lookbehind, including supplementary
+ * characters or unpaired surrogates.
+ */
+ static final class NotBehindS extends NotBehind {
+ NotBehindS(Node cond, int rmax, int rmin) {
+ super(cond, rmax, rmin);
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int rmaxChars = countChars(seq, i, -rmax);
+ int rminChars = countChars(seq, i, -rmin);
+ int savedFrom = matcher.from;
+ int savedLBT = matcher.lookbehindTo;
+ boolean conditionMatched = false;
+ int startIndex = (!matcher.transparentBounds) ?
+ matcher.from : 0;
+ int from = Math.max(i - rmaxChars, startIndex);
+ matcher.lookbehindTo = i;
+ // Relax transparent region boundaries for lookbehind
+ if (matcher.transparentBounds)
+ matcher.from = 0;
+ for (int j = i - rminChars;
+ !conditionMatched && j >= from;
+ j -= j>from ? countChars(seq, j, -1) : 1) {
+ conditionMatched = cond.match(matcher, j, seq);
+ }
+ //Reinstate region boundaries
+ matcher.from = savedFrom;
+ matcher.lookbehindTo = savedLBT;
+ return !conditionMatched && next.match(matcher, i, seq);
+ }
+ }
+
+ /**
+ * Returns the set union of two CharProperty nodes.
+ */
+ private static CharProperty union(final CharProperty lhs,
+ final CharProperty rhs) {
+ return new CharProperty() {
+ boolean isSatisfiedBy(int ch) {
+ return lhs.isSatisfiedBy(ch) || rhs.isSatisfiedBy(ch);}};
+ }
+
+ /**
+ * Returns the set intersection of two CharProperty nodes.
+ */
+ private static CharProperty intersection(final CharProperty lhs,
+ final CharProperty rhs) {
+ return new CharProperty() {
+ boolean isSatisfiedBy(int ch) {
+ return lhs.isSatisfiedBy(ch) && rhs.isSatisfiedBy(ch);}};
+ }
+
+ /**
+ * Returns the set difference of two CharProperty nodes.
+ */
+ private static CharProperty setDifference(final CharProperty lhs,
+ final CharProperty rhs) {
+ return new CharProperty() {
+ boolean isSatisfiedBy(int ch) {
+ return ! rhs.isSatisfiedBy(ch) && lhs.isSatisfiedBy(ch);}};
+ }
+
+ /**
+ * Handles word boundaries. Includes a field to allow this one class to
+ * deal with the different types of word boundaries we can match. The word
+ * characters include underscores, letters, and digits. Non spacing marks
+ * can are also part of a word if they have a base character, otherwise
+ * they are ignored for purposes of finding word boundaries.
+ */
+ static final class Bound extends Node {
+ static int LEFT = 0x1;
+ static int RIGHT= 0x2;
+ static int BOTH = 0x3;
+ static int NONE = 0x4;
+ int type;
+ boolean useUWORD;
+ Bound(int n, boolean useUWORD) {
+ type = n;
+ this.useUWORD = useUWORD;
+ }
+
+ boolean isWord(int ch) {
+ return useUWORD ? UnicodeProp.WORD.is(ch)
+ : (ch == '_' || Character.isLetterOrDigit(ch));
+ }
+
+ int check(Matcher matcher, int i, CharSequence seq) {
+ int ch;
+ boolean left = false;
+ int startIndex = matcher.from;
+ int endIndex = matcher.to;
+ if (matcher.transparentBounds) {
+ startIndex = 0;
+ endIndex = matcher.getTextLength();
+ }
+ if (i > startIndex) {
+ ch = Character.codePointBefore(seq, i);
+ left = (isWord(ch) ||
+ ((Character.getType(ch) == Character.NON_SPACING_MARK)
+ && hasBaseCharacter(matcher, i-1, seq)));
+ }
+ boolean right = false;
+ if (i < endIndex) {
+ ch = Character.codePointAt(seq, i);
+ right = (isWord(ch) ||
+ ((Character.getType(ch) == Character.NON_SPACING_MARK)
+ && hasBaseCharacter(matcher, i, seq)));
+ } else {
+ // Tried to access char past the end
+ matcher.hitEnd = true;
+ // The addition of another char could wreck a boundary
+ matcher.requireEnd = true;
+ }
+ return ((left ^ right) ? (right ? LEFT : RIGHT) : NONE);
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ return (check(matcher, i, seq) & type) > 0
+ && next.match(matcher, i, seq);
+ }
+ }
+
+ /**
+ * Non spacing marks only count as word characters in bounds calculations
+ * if they have a base character.
+ */
+ private static boolean hasBaseCharacter(Matcher matcher, int i,
+ CharSequence seq)
+ {
+ int start = (!matcher.transparentBounds) ?
+ matcher.from : 0;
+ for (int x=i; x >= start; x--) {
+ int ch = Character.codePointAt(seq, x);
+ if (Character.isLetterOrDigit(ch))
+ return true;
+ if (Character.getType(ch) == Character.NON_SPACING_MARK)
+ continue;
+ return false;
+ }
+ return false;
+ }
+
+ /**
+ * Attempts to match a slice in the input using the Boyer-Moore string
+ * matching algorithm. The algorithm is based on the idea that the
+ * pattern can be shifted farther ahead in the search text if it is
+ * matched right to left.
+ * <p>
+ * The pattern is compared to the input one character at a time, from
+ * the rightmost character in the pattern to the left. If the characters
+ * all match the pattern has been found. If a character does not match,
+ * the pattern is shifted right a distance that is the maximum of two
+ * functions, the bad character shift and the good suffix shift. This
+ * shift moves the attempted match position through the input more
+ * quickly than a naive one position at a time check.
+ * <p>
+ * The bad character shift is based on the character from the text that
+ * did not match. If the character does not appear in the pattern, the
+ * pattern can be shifted completely beyond the bad character. If the
+ * character does occur in the pattern, the pattern can be shifted to
+ * line the pattern up with the next occurrence of that character.
+ * <p>
+ * The good suffix shift is based on the idea that some subset on the right
+ * side of the pattern has matched. When a bad character is found, the
+ * pattern can be shifted right by the pattern length if the subset does
+ * not occur again in pattern, or by the amount of distance to the
+ * next occurrence of the subset in the pattern.
+ *
+ * Boyer-Moore search methods adapted from code by Amy Yu.
+ */
+ static class BnM extends Node {
+ int[] buffer;
+ int[] lastOcc;
+ int[] optoSft;
+
+ /**
+ * Pre calculates arrays needed to generate the bad character
+ * shift and the good suffix shift. Only the last seven bits
+ * are used to see if chars match; This keeps the tables small
+ * and covers the heavily used ASCII range, but occasionally
+ * results in an aliased match for the bad character shift.
+ */
+ static Node optimize(Node node) {
+ if (!(node instanceof Slice)) {
+ return node;
+ }
+
+ int[] src = ((Slice) node).buffer;
+ int patternLength = src.length;
+ // The BM algorithm requires a bit of overhead;
+ // If the pattern is short don't use it, since
+ // a shift larger than the pattern length cannot
+ // be used anyway.
+ if (patternLength < 4) {
+ return node;
+ }
+ int i, j, k;
+ int[] lastOcc = new int[128];
+ int[] optoSft = new int[patternLength];
+ // Precalculate part of the bad character shift
+ // It is a table for where in the pattern each
+ // lower 7-bit value occurs
+ for (i = 0; i < patternLength; i++) {
+ lastOcc[src[i]&0x7F] = i + 1;
+ }
+ // Precalculate the good suffix shift
+ // i is the shift amount being considered
+NEXT: for (i = patternLength; i > 0; i--) {
+ // j is the beginning index of suffix being considered
+ for (j = patternLength - 1; j >= i; j--) {
+ // Testing for good suffix
+ if (src[j] == src[j-i]) {
+ // src[j..len] is a good suffix
+ optoSft[j-1] = i;
+ } else {
+ // No match. The array has already been
+ // filled up with correct values before.
+ continue NEXT;
+ }
+ }
+ // This fills up the remaining of optoSft
+ // any suffix can not have larger shift amount
+ // then its sub-suffix. Why???
+ while (j > 0) {
+ optoSft[--j] = i;
+ }
+ }
+ // Set the guard value because of unicode compression
+ optoSft[patternLength-1] = 1;
+ if (node instanceof SliceS)
+ return new BnMS(src, lastOcc, optoSft, node.next);
+ return new BnM(src, lastOcc, optoSft, node.next);
+ }
+ BnM(int[] src, int[] lastOcc, int[] optoSft, Node next) {
+ this.buffer = src;
+ this.lastOcc = lastOcc;
+ this.optoSft = optoSft;
+ this.next = next;
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int[] src = buffer;
+ int patternLength = src.length;
+ int last = matcher.to - patternLength;
+
+ // Loop over all possible match positions in text
+NEXT: while (i <= last) {
+ // Loop over pattern from right to left
+ for (int j = patternLength - 1; j >= 0; j--) {
+ int ch = seq.charAt(i+j);
+ if (ch != src[j]) {
+ // Shift search to the right by the maximum of the
+ // bad character shift and the good suffix shift
+ i += Math.max(j + 1 - lastOcc[ch&0x7F], optoSft[j]);
+ continue NEXT;
+ }
+ }
+ // Entire pattern matched starting at i
+ matcher.first = i;
+ boolean ret = next.match(matcher, i + patternLength, seq);
+ if (ret) {
+ matcher.first = i;
+ matcher.groups[0] = matcher.first;
+ matcher.groups[1] = matcher.last;
+ return true;
+ }
+ i++;
+ }
+ // BnM is only used as the leading node in the unanchored case,
+ // and it replaced its Start() which always searches to the end
+ // if it doesn't find what it's looking for, so hitEnd is true.
+ matcher.hitEnd = true;
+ return false;
+ }
+ boolean study(TreeInfo info) {
+ info.minLength += buffer.length;
+ info.maxValid = false;
+ return next.study(info);
+ }
+ }
+
+ /**
+ * Supplementary support version of BnM(). Unpaired surrogates are
+ * also handled by this class.
+ */
+ static final class BnMS extends BnM {
+ int lengthInChars;
+
+ BnMS(int[] src, int[] lastOcc, int[] optoSft, Node next) {
+ super(src, lastOcc, optoSft, next);
+ for (int x = 0; x < buffer.length; x++) {
+ lengthInChars += Character.charCount(buffer[x]);
+ }
+ }
+ boolean match(Matcher matcher, int i, CharSequence seq) {
+ int[] src = buffer;
+ int patternLength = src.length;
+ int last = matcher.to - lengthInChars;
+
+ // Loop over all possible match positions in text
+NEXT: while (i <= last) {
+ // Loop over pattern from right to left
+ int ch;
+ for (int j = countChars(seq, i, patternLength), x = patternLength - 1;
+ j > 0; j -= Character.charCount(ch), x--) {
+ ch = Character.codePointBefore(seq, i+j);
+ if (ch != src[x]) {
+ // Shift search to the right by the maximum of the
+ // bad character shift and the good suffix shift
+ int n = Math.max(x + 1 - lastOcc[ch&0x7F], optoSft[x]);
+ i += countChars(seq, i, n);
+ continue NEXT;
+ }
+ }
+ // Entire pattern matched starting at i
+ matcher.first = i;
+ boolean ret = next.match(matcher, i + lengthInChars, seq);
+ if (ret) {
+ matcher.first = i;
+ matcher.groups[0] = matcher.first;
+ matcher.groups[1] = matcher.last;
+ return true;
+ }
+ i += countChars(seq, i, 1);
+ }
+ matcher.hitEnd = true;
+ return false;
+ }
+ }
+
+///////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
+
+ /**
+ * This must be the very first initializer.
+ */
+ static Node accept = new Node();
+
+ static Node lastAccept = new LastNode();
+
+ private static class CharPropertyNames {
+
+ static CharProperty charPropertyFor(String name) {
+ CharPropertyFactory m = map.get(name);
+ return m == null ? null : m.make();
+ }
+
+ private static abstract class CharPropertyFactory {
+ abstract CharProperty make();
+ }
+
+ private static void defCategory(String name,
+ final int typeMask) {
+ map.put(name, new CharPropertyFactory() {
+ CharProperty make() { return new Category(typeMask);}});
+ }
+
+ private static void defRange(String name,
+ final int lower, final int upper) {
+ map.put(name, new CharPropertyFactory() {
+ CharProperty make() { return rangeFor(lower, upper);}});
+ }
+
+ private static void defCtype(String name,
+ final int ctype) {
+ map.put(name, new CharPropertyFactory() {
+ CharProperty make() { return new Ctype(ctype);}});
+ }
+
+ private static abstract class CloneableProperty
+ extends CharProperty implements Cloneable
+ {
+ public CloneableProperty clone() {
+ try {
+ return (CloneableProperty) super.clone();
+ } catch (CloneNotSupportedException e) {
+ throw new AssertionError(e);
+ }
+ }
+ }
+
+ private static void defClone(String name,
+ final CloneableProperty p) {
+ map.put(name, new CharPropertyFactory() {
+ CharProperty make() { return p.clone();}});
+ }
+
+ private static final HashMap<String, CharPropertyFactory> map
+ = new HashMap<>();
+
+ static {
+ // Unicode character property aliases, defined in
+ // http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt
+ defCategory("Cn", 1<<Character.UNASSIGNED);
+ defCategory("Lu", 1<<Character.UPPERCASE_LETTER);
+ defCategory("Ll", 1<<Character.LOWERCASE_LETTER);
+ defCategory("Lt", 1<<Character.TITLECASE_LETTER);
+ defCategory("Lm", 1<<Character.MODIFIER_LETTER);
+ defCategory("Lo", 1<<Character.OTHER_LETTER);
+ defCategory("Mn", 1<<Character.NON_SPACING_MARK);
+ defCategory("Me", 1<<Character.ENCLOSING_MARK);
+ defCategory("Mc", 1<<Character.COMBINING_SPACING_MARK);
+ defCategory("Nd", 1<<Character.DECIMAL_DIGIT_NUMBER);
+ defCategory("Nl", 1<<Character.LETTER_NUMBER);
+ defCategory("No", 1<<Character.OTHER_NUMBER);
+ defCategory("Zs", 1<<Character.SPACE_SEPARATOR);
+ defCategory("Zl", 1<<Character.LINE_SEPARATOR);
+ defCategory("Zp", 1<<Character.PARAGRAPH_SEPARATOR);
+ defCategory("Cc", 1<<Character.CONTROL);
+ defCategory("Cf", 1<<Character.FORMAT);
+ defCategory("Co", 1<<Character.PRIVATE_USE);
+ defCategory("Cs", 1<<Character.SURROGATE);
+ defCategory("Pd", 1<<Character.DASH_PUNCTUATION);
+ defCategory("Ps", 1<<Character.START_PUNCTUATION);
+ defCategory("Pe", 1<<Character.END_PUNCTUATION);
+ defCategory("Pc", 1<<Character.CONNECTOR_PUNCTUATION);
+ defCategory("Po", 1<<Character.OTHER_PUNCTUATION);
+ defCategory("Sm", 1<<Character.MATH_SYMBOL);
+ defCategory("Sc", 1<<Character.CURRENCY_SYMBOL);
+ defCategory("Sk", 1<<Character.MODIFIER_SYMBOL);
+ defCategory("So", 1<<Character.OTHER_SYMBOL);
+ defCategory("Pi", 1<<Character.INITIAL_QUOTE_PUNCTUATION);
+ defCategory("Pf", 1<<Character.FINAL_QUOTE_PUNCTUATION);
+ defCategory("L", ((1<<Character.UPPERCASE_LETTER) |
+ (1<<Character.LOWERCASE_LETTER) |
+ (1<<Character.TITLECASE_LETTER) |
+ (1<<Character.MODIFIER_LETTER) |
+ (1<<Character.OTHER_LETTER)));
+ defCategory("M", ((1<<Character.NON_SPACING_MARK) |
+ (1<<Character.ENCLOSING_MARK) |
+ (1<<Character.COMBINING_SPACING_MARK)));
+ defCategory("N", ((1<<Character.DECIMAL_DIGIT_NUMBER) |
+ (1<<Character.LETTER_NUMBER) |
+ (1<<Character.OTHER_NUMBER)));
+ defCategory("Z", ((1<<Character.SPACE_SEPARATOR) |
+ (1<<Character.LINE_SEPARATOR) |
+ (1<<Character.PARAGRAPH_SEPARATOR)));
+ defCategory("C", ((1<<Character.CONTROL) |
+ (1<<Character.FORMAT) |
+ (1<<Character.PRIVATE_USE) |
+ (1<<Character.SURROGATE))); // Other
+ defCategory("P", ((1<<Character.DASH_PUNCTUATION) |
+ (1<<Character.START_PUNCTUATION) |
+ (1<<Character.END_PUNCTUATION) |
+ (1<<Character.CONNECTOR_PUNCTUATION) |
+ (1<<Character.OTHER_PUNCTUATION) |
+ (1<<Character.INITIAL_QUOTE_PUNCTUATION) |
+ (1<<Character.FINAL_QUOTE_PUNCTUATION)));
+ defCategory("S", ((1<<Character.MATH_SYMBOL) |
+ (1<<Character.CURRENCY_SYMBOL) |
+ (1<<Character.MODIFIER_SYMBOL) |
+ (1<<Character.OTHER_SYMBOL)));
+ defCategory("LC", ((1<<Character.UPPERCASE_LETTER) |
+ (1<<Character.LOWERCASE_LETTER) |
+ (1<<Character.TITLECASE_LETTER)));
+ defCategory("LD", ((1<<Character.UPPERCASE_LETTER) |
+ (1<<Character.LOWERCASE_LETTER) |
+ (1<<Character.TITLECASE_LETTER) |
+ (1<<Character.MODIFIER_LETTER) |
+ (1<<Character.OTHER_LETTER) |
+ (1<<Character.DECIMAL_DIGIT_NUMBER)));
+ defRange("L1", 0x00, 0xFF); // Latin-1
+ map.put("all", new CharPropertyFactory() {
+ CharProperty make() { return new All(); }});
+
+ // Posix regular expression character classes, defined in
+ // http://www.unix.org/onlinepubs/009695399/basedefs/xbd_chap09.html
+ defRange("ASCII", 0x00, 0x7F); // ASCII
+ defCtype("Alnum", ASCII.ALNUM); // Alphanumeric characters
+ defCtype("Alpha", ASCII.ALPHA); // Alphabetic characters
+ defCtype("Blank", ASCII.BLANK); // Space and tab characters
+ defCtype("Cntrl", ASCII.CNTRL); // Control characters
+ defRange("Digit", '0', '9'); // Numeric characters
+ defCtype("Graph", ASCII.GRAPH); // printable and visible
+ defRange("Lower", 'a', 'z'); // Lower-case alphabetic
+ defRange("Print", 0x20, 0x7E); // Printable characters
+ defCtype("Punct", ASCII.PUNCT); // Punctuation characters
+ defCtype("Space", ASCII.SPACE); // Space characters
+ defRange("Upper", 'A', 'Z'); // Upper-case alphabetic
+ defCtype("XDigit",ASCII.XDIGIT); // hexadecimal digits
+
+ // Java character properties, defined by methods in Character.java
+ defClone("javaLowerCase", new CloneableProperty() {
+ boolean isSatisfiedBy(int ch) {
+ return Character.isLowerCase(ch);}});
+ defClone("javaUpperCase", new CloneableProperty() {
+ boolean isSatisfiedBy(int ch) {
+ return Character.isUpperCase(ch);}});
+ defClone("javaAlphabetic", new CloneableProperty() {
+ boolean isSatisfiedBy(int ch) {
+ return Character.isAlphabetic(ch);}});
+ defClone("javaIdeographic", new CloneableProperty() {
+ boolean isSatisfiedBy(int ch) {
+ return Character.isIdeographic(ch);}});
+ defClone("javaTitleCase", new CloneableProperty() {
+ boolean isSatisfiedBy(int ch) {
+ return Character.isTitleCase(ch);}});
+ defClone("javaDigit", new CloneableProperty() {
+ boolean isSatisfiedBy(int ch) {
+ return Character.isDigit(ch);}});
+ defClone("javaDefined", new CloneableProperty() {
+ boolean isSatisfiedBy(int ch) {
+ return Character.isDefined(ch);}});
+ defClone("javaLetter", new CloneableProperty() {
+ boolean isSatisfiedBy(int ch) {
+ return Character.isLetter(ch);}});
+ defClone("javaLetterOrDigit", new CloneableProperty() {
+ boolean isSatisfiedBy(int ch) {
+ return Character.isLetterOrDigit(ch);}});
+ defClone("javaJavaIdentifierStart", new CloneableProperty() {
+ boolean isSatisfiedBy(int ch) {
+ return Character.isJavaIdentifierStart(ch);}});
+ defClone("javaJavaIdentifierPart", new CloneableProperty() {
+ boolean isSatisfiedBy(int ch) {
+ return Character.isJavaIdentifierPart(ch);}});
+ defClone("javaUnicodeIdentifierStart", new CloneableProperty() {
+ boolean isSatisfiedBy(int ch) {
+ return Character.isUnicodeIdentifierStart(ch);}});
+ defClone("javaUnicodeIdentifierPart", new CloneableProperty() {
+ boolean isSatisfiedBy(int ch) {
+ return Character.isUnicodeIdentifierPart(ch);}});
+ defClone("javaIdentifierIgnorable", new CloneableProperty() {
+ boolean isSatisfiedBy(int ch) {
+ return Character.isIdentifierIgnorable(ch);}});
+ defClone("javaSpaceChar", new CloneableProperty() {
+ boolean isSatisfiedBy(int ch) {
+ return Character.isSpaceChar(ch);}});
+ defClone("javaWhitespace", new CloneableProperty() {
+ boolean isSatisfiedBy(int ch) {
+ return Character.isWhitespace(ch);}});
+ defClone("javaISOControl", new CloneableProperty() {
+ boolean isSatisfiedBy(int ch) {
+ return Character.isISOControl(ch);}});
+ defClone("javaMirrored", new CloneableProperty() {
+ boolean isSatisfiedBy(int ch) {
+ return Character.isMirrored(ch);}});
+ }
+ }
+
+ /**
+ * Creates a predicate which can be used to match a string.
+ *
+ * @return The predicate which can be used for matching on a string
+ * @since 1.8
+ */
+ /*public Predicate<String> asPredicate() {
+ return s -> matcher(s).find();
+ }*/
+
+ /**
+ * Creates a stream from the given input sequence around matches of this
+ * pattern.
+ *
+ * <p> The stream returned by this method contains each substring of the
+ * input sequence that is terminated by another subsequence that matches
+ * this pattern or is terminated by the end of the input sequence. The
+ * substrings in the stream are in the order in which they occur in the
+ * input. Trailing empty strings will be discarded and not encountered in
+ * the stream.
+ *
+ * <p> If this pattern does not match any subsequence of the input then
+ * the resulting stream has just one element, namely the input sequence in
+ * string form.
+ *
+ * <p> When there is a positive-width match at the beginning of the input
+ * sequence then an empty leading substring is included at the beginning
+ * of the stream. A zero-width match at the beginning however never produces
+ * such empty leading substring.
+ *
+ * <p> If the input sequence is mutable, it must remain constant during the
+ * execution of the terminal stream operation. Otherwise, the result of the
+ * terminal stream operation is undefined.
+ *
+ * @param input
+ * The character sequence to be split
+ *
+ * @return The stream of strings computed by splitting the input
+ * around matches of this pattern
+ * @see #split(CharSequence)
+ * @since 1.8
+ */
+ /*
+ public Stream<String> splitAsStream(final CharSequence input) {
+ class MatcherIterator implements Iterator<String> {
+ private final Matcher matcher;
+ // The start position of the next sub-sequence of input
+ // when current == input.length there are no more elements
+ private int current;
+ // null if the next element, if any, needs to obtained
+ private String nextElement;
+ // > 0 if there are N next empty elements
+ private int emptyElementCount;
+
+ MatcherIterator() {
+ this.matcher = matcher(input);
+ }
+
+ public String next() {
+ if (!hasNext())
+ throw new NoSuchElementException();
+
+ if (emptyElementCount == 0) {
+ String n = nextElement;
+ nextElement = null;
+ return n;
+ } else {
+ emptyElementCount--;
+ return "";
+ }
+ }
+
+ public boolean hasNext() {
+ if (nextElement != null || emptyElementCount > 0)
+ return true;
+
+ if (current == input.length())
+ return false;
+
+ // Consume the next matching element
+ // Count sequence of matching empty elements
+ while (matcher.find()) {
+ nextElement = input.subSequence(current, matcher.start()).toString();
+ current = matcher.end();
+ if (!nextElement.isEmpty()) {
+ return true;
+ } else if (current > 0) { // no empty leading substring for zero-width
+ // match at the beginning of the input
+ emptyElementCount++;
+ }
+ }
+
+ // Consume last matching element
+ nextElement = input.subSequence(current, input.length()).toString();
+ current = input.length();
+ if (!nextElement.isEmpty()) {
+ return true;
+ } else {
+ // Ignore a terminal sequence of matching empty elements
+ emptyElementCount = 0;
+ nextElement = null;
+ return false;
+ }
+ }
+ }
+ return StreamSupport.stream(Spliterators.spliteratorUnknownSize(
+ new MatcherIterator(), Spliterator.ORDERED | Spliterator.NONNULL), false);
+ } */
+}
diff --git a/src/org/apache/xpath/regex/PatternSyntaxException.java b/src/org/apache/xpath/regex/PatternSyntaxException.java
new file mode 100644
index 00000000..252951dc
--- /dev/null
+++ b/src/org/apache/xpath/regex/PatternSyntaxException.java
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 1999, 2018, Oracle and/or its affiliates. All rights reserved.
+ * ORACLE PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ */
+
+package org.apache.xpath.regex;
+
+import sun.security.action.GetPropertyAction;
+
+
+/**
+ * Unchecked exception thrown to indicate a syntax error in a
+ * regular-expression pattern.
+ *
+ * @author unascribed
+ * @since 1.4
+ * @spec JSR-51
+ */
+
+public class PatternSyntaxException
+ extends IllegalArgumentException
+{
+ private static final long serialVersionUID = -3864639126226059218L;
+
+ private final String desc;
+ private final String pattern;
+ private final int index;
+
+ /**
+ * Constructs a new instance of this class.
+ *
+ * @param desc
+ * A description of the error
+ *
+ * @param regex
+ * The erroneous pattern
+ *
+ * @param index
+ * The approximate index in the pattern of the error,
+ * or <tt>-1</tt> if the index is not known
+ */
+ public PatternSyntaxException(String desc, String regex, int index) {
+ this.desc = desc;
+ this.pattern = regex;
+ this.index = index;
+ }
+
+ /**
+ * Retrieves the error index.
+ *
+ * @return The approximate index in the pattern of the error,
+ * or <tt>-1</tt> if the index is not known
+ */
+ public int getIndex() {
+ return index;
+ }
+
+ /**
+ * Retrieves the description of the error.
+ *
+ * @return The description of the error
+ */
+ public String getDescription() {
+ return desc;
+ }
+
+ /**
+ * Retrieves the erroneous regular-expression pattern.
+ *
+ * @return The erroneous pattern
+ */
+ public String getPattern() {
+ return pattern;
+ }
+
+ private static final String nl =
+ java.security.AccessController
+ .doPrivileged(new GetPropertyAction("line.separator"));
+
+ /**
+ * Returns a multi-line string containing the description of the syntax
+ * error and its index, the erroneous regular-expression pattern, and a
+ * visual indication of the error index within the pattern.
+ *
+ * @return The full detail message
+ */
+ public String getMessage() {
+ StringBuffer sb = new StringBuffer();
+ sb.append(desc);
+ if (index >= 0) {
+ sb.append(" near index ");
+ sb.append(index);
+ }
+ sb.append(nl);
+ sb.append(pattern);
+ if (index >= 0 && pattern != null && index < pattern.length()) {
+ sb.append(nl);
+ for (int i = 0; i < index; i++) sb.append(' ');
+ sb.append('^');
+ }
+ return sb.toString();
+ }
+
+}
diff --git a/src/org/apache/xpath/regex/UnicodeProp.java b/src/org/apache/xpath/regex/UnicodeProp.java
new file mode 100644
index 00000000..eaa522b3
--- /dev/null
+++ b/src/org/apache/xpath/regex/UnicodeProp.java
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2011, 2013, Oracle and/or its affiliates. All rights reserved.
+ * ORACLE PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ *
+ */
+
+package org.apache.xpath.regex;
+
+import java.util.HashMap;
+import java.util.Locale;
+
+enum UnicodeProp {
+
+ ALPHABETIC {
+ public boolean is(int ch) {
+ return Character.isAlphabetic(ch);
+ }
+ },
+
+ LETTER {
+ public boolean is(int ch) {
+ return Character.isLetter(ch);
+ }
+ },
+
+ IDEOGRAPHIC {
+ public boolean is(int ch) {
+ return Character.isIdeographic(ch);
+ }
+ },
+
+ LOWERCASE {
+ public boolean is(int ch) {
+ return Character.isLowerCase(ch);
+ }
+ },
+
+ UPPERCASE {
+ public boolean is(int ch) {
+ return Character.isUpperCase(ch);
+ }
+ },
+
+ TITLECASE {
+ public boolean is(int ch) {
+ return Character.isTitleCase(ch);
+ }
+ },
+
+ WHITE_SPACE {
+ // \p{Whitespace}
+ public boolean is(int ch) {
+ return ((((1 << Character.SPACE_SEPARATOR) |
+ (1 << Character.LINE_SEPARATOR) |
+ (1 << Character.PARAGRAPH_SEPARATOR)) >> Character.getType(ch)) & 1)
+ != 0 || (ch >= 0x9 && ch <= 0xd) || (ch == 0x85);
+ }
+ },
+
+ CONTROL {
+ // \p{gc=Control}
+ public boolean is(int ch) {
+ return Character.getType(ch) == Character.CONTROL;
+ }
+ },
+
+ PUNCTUATION {
+ // \p{gc=Punctuation}
+ public boolean is(int ch) {
+ return ((((1 << Character.CONNECTOR_PUNCTUATION) |
+ (1 << Character.DASH_PUNCTUATION) |
+ (1 << Character.START_PUNCTUATION) |
+ (1 << Character.END_PUNCTUATION) |
+ (1 << Character.OTHER_PUNCTUATION) |
+ (1 << Character.INITIAL_QUOTE_PUNCTUATION) |
+ (1 << Character.FINAL_QUOTE_PUNCTUATION)) >> Character.getType(ch)) & 1)
+ != 0;
+ }
+ },
+
+ HEX_DIGIT {
+ // \p{gc=Decimal_Number}
+ // \p{Hex_Digit} -> PropList.txt: Hex_Digit
+ public boolean is(int ch) {
+ return DIGIT.is(ch) ||
+ (ch >= 0x0030 && ch <= 0x0039) ||
+ (ch >= 0x0041 && ch <= 0x0046) ||
+ (ch >= 0x0061 && ch <= 0x0066) ||
+ (ch >= 0xFF10 && ch <= 0xFF19) ||
+ (ch >= 0xFF21 && ch <= 0xFF26) ||
+ (ch >= 0xFF41 && ch <= 0xFF46);
+ }
+ },
+
+ ASSIGNED {
+ public boolean is(int ch) {
+ return Character.getType(ch) != Character.UNASSIGNED;
+ }
+ },
+
+ NONCHARACTER_CODE_POINT {
+ // PropList.txt:Noncharacter_Code_Point
+ public boolean is(int ch) {
+ return (ch & 0xfffe) == 0xfffe || (ch >= 0xfdd0 && ch <= 0xfdef);
+ }
+ },
+
+ DIGIT {
+ // \p{gc=Decimal_Number}
+ public boolean is(int ch) {
+ return Character.isDigit(ch);
+ }
+ },
+
+ ALNUM {
+ // \p{alpha}
+ // \p{digit}
+ public boolean is(int ch) {
+ return ALPHABETIC.is(ch) || DIGIT.is(ch);
+ }
+ },
+
+ BLANK {
+ // \p{Whitespace} --
+ // [\N{LF} \N{VT} \N{FF} \N{CR} \N{NEL} -> 0xa, 0xb, 0xc, 0xd, 0x85
+ // \p{gc=Line_Separator}
+ // \p{gc=Paragraph_Separator}]
+ public boolean is(int ch) {
+ return Character.getType(ch) == Character.SPACE_SEPARATOR ||
+ ch == 0x9; // \N{HT}
+ }
+ },
+
+ GRAPH {
+ // [^
+ // \p{space}
+ // \p{gc=Control}
+ // \p{gc=Surrogate}
+ // \p{gc=Unassigned}]
+ public boolean is(int ch) {
+ return ((((1 << Character.SPACE_SEPARATOR) |
+ (1 << Character.LINE_SEPARATOR) |
+ (1 << Character.PARAGRAPH_SEPARATOR) |
+ (1 << Character.CONTROL) |
+ (1 << Character.SURROGATE) |
+ (1 << Character.UNASSIGNED)) >> Character.getType(ch)) & 1)
+ == 0;
+ }
+ },
+
+ PRINT {
+ // \p{graph}
+ // \p{blank}
+ // -- \p{cntrl}
+ public boolean is(int ch) {
+ return (GRAPH.is(ch) || BLANK.is(ch)) && !CONTROL.is(ch);
+ }
+ },
+
+ WORD {
+ // \p{alpha}
+ // \p{gc=Mark}
+ // \p{digit}
+ // \p{gc=Connector_Punctuation}
+ // \p{Join_Control} 200C..200D
+
+ public boolean is(int ch) {
+ return ALPHABETIC.is(ch) ||
+ ((((1 << Character.NON_SPACING_MARK) |
+ (1 << Character.ENCLOSING_MARK) |
+ (1 << Character.COMBINING_SPACING_MARK) |
+ (1 << Character.DECIMAL_DIGIT_NUMBER) |
+ (1 << Character.CONNECTOR_PUNCTUATION)) >> Character.getType(ch)) & 1)
+ != 0 ||
+ JOIN_CONTROL.is(ch);
+ }
+ },
+
+ JOIN_CONTROL {
+ // 200C..200D PropList.txt:Join_Control
+ public boolean is(int ch) {
+ return (ch == 0x200C || ch == 0x200D);
+ }
+ };
+
+ private final static HashMap<String, String> posix = new HashMap<>();
+ private final static HashMap<String, String> aliases = new HashMap<>();
+ static {
+ posix.put("ALPHA", "ALPHABETIC");
+ posix.put("LOWER", "LOWERCASE");
+ posix.put("UPPER", "UPPERCASE");
+ posix.put("SPACE", "WHITE_SPACE");
+ posix.put("PUNCT", "PUNCTUATION");
+ posix.put("XDIGIT","HEX_DIGIT");
+ posix.put("ALNUM", "ALNUM");
+ posix.put("CNTRL", "CONTROL");
+ posix.put("DIGIT", "DIGIT");
+ posix.put("BLANK", "BLANK");
+ posix.put("GRAPH", "GRAPH");
+ posix.put("PRINT", "PRINT");
+
+ aliases.put("WHITESPACE", "WHITE_SPACE");
+ aliases.put("HEXDIGIT","HEX_DIGIT");
+ aliases.put("NONCHARACTERCODEPOINT", "NONCHARACTER_CODE_POINT");
+ aliases.put("JOINCONTROL", "JOIN_CONTROL");
+ }
+
+ public static UnicodeProp forName(String propName) {
+ propName = propName.toUpperCase(Locale.ENGLISH);
+ String alias = aliases.get(propName);
+ if (alias != null)
+ propName = alias;
+ try {
+ return valueOf (propName);
+ } catch (IllegalArgumentException x) {}
+ return null;
+ }
+
+ public static UnicodeProp forPOSIXName(String propName) {
+ propName = posix.get(propName.toUpperCase(Locale.ENGLISH));
+ if (propName == null)
+ return null;
+ return valueOf (propName);
+ }
+
+ public abstract boolean is(int ch);
+}
diff --git a/src/org/apache/xpath/res/XPATHErrorResources.java b/src/org/apache/xpath/res/XPATHErrorResources.java
index 2c8a443a..a10c9a8b 100644
--- a/src/org/apache/xpath/res/XPATHErrorResources.java
+++ b/src/org/apache/xpath/res/XPATHErrorResources.java
@@ -230,6 +230,8 @@ public static final String ER_IGNORABLE_WHITESPACE_NOT_HANDLED =
"ER_FASTSTRINGBUFFER_CANNOT_BE_NULL";
/** 2 or 3 */
public static final String ER_TWO_OR_THREE = "ER_TWO_OR_THREE";
+ /** 3 or 4 */
+ public static final String ER_THREE_OR_FOUR = "ER_THREE_OR_FOUR";
/** Variable accessed before it is bound! */
public static final String ER_VARIABLE_ACCESSED_BEFORE_BIND =
"ER_VARIABLE_ACCESSED_BEFORE_BIND";
@@ -354,6 +356,10 @@ public static final String ER_IGNORABLE_WHITESPACE_NOT_HANDLED =
/** str() not supported by XRTreeFragSelectWrapper */
public static final String ER_STR_NOT_SUPPORTED_XRTREEFRAGSELECTWRAPPER =
"ER_STR_NOT_SUPPORTED_XRTREEFRAGSELECTWRAPPER";
+
+ public static final String ER_INVALID_REGEX_FLAGS = "ER_INVALID_REGEX_FLAGS";
+
+ public static final String ER_INVALID_REGEX = "ER_INVALID_REGEX";
// Error messages...
@@ -616,6 +622,9 @@ public static final String ER_IGNORABLE_WHITESPACE_NOT_HANDLED =
{ ER_TWO_OR_THREE,
"2 or 3"},
+
+ { ER_THREE_OR_FOUR,
+ "3 or 4"},
{ ER_VARIABLE_ACCESSED_BEFORE_BIND,
"Variable accessed before it is bound!"},
@@ -827,6 +836,12 @@ public static final String ER_IGNORABLE_WHITESPACE_NOT_HANDLED =
{ ER_NULL_XPATH_VARIABLE_RESOLVER,
"Attempting to set a null XPathVariableResolver:{0}#setXPathVariableResolver(null)"},
+
+ { ER_INVALID_REGEX_FLAGS,
+ "FORX0001: Invalid regular expression flag(s) usage, with function call {0}."},
+
+ { ER_INVALID_REGEX,
+ "FORX0002: Invalid regular expression syntax used, with function call {0}."},
//END: Definitions of error keys used in exception messages of JAXP 1.3 XPath API implementation
diff --git a/src/org/apache/xpath/res/XPATHErrorResources_sv.java b/src/org/apache/xpath/res/XPATHErrorResources_sv.java
index 68a7802b..9eb20904 100644
--- a/src/org/apache/xpath/res/XPATHErrorResources_sv.java
+++ b/src/org/apache/xpath/res/XPATHErrorResources_sv.java
@@ -750,7 +750,10 @@ public static final int MAX_CODE = 108; // this is needed to keep track of the
{
ER_TWO_OR_THREE,
"2 eller 3"},
-
+
+ {
+ ER_THREE_OR_FOUR,
+ "3 eller 4"},
/** Variable accessed before it is bound! */
//public static final int ER_VARIABLE_ACCESSED_BEFORE_BIND = 85;
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@xalan.apache.org
For additional commands, e-mail: commits-help@xalan.apache.org