You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ga...@apache.org on 2015/07/22 22:48:03 UTC
[35/50] [abbrv] hive git commit: HIVE-11141 : Improve RuleRegExp when
the Expression node stack gets huge (Hari Subramaniyan,
reviewed by Laljo John Pullokkaran, Jesus Camacho Rodriguez)
HIVE-11141 : Improve RuleRegExp when the Expression node stack gets huge (Hari Subramaniyan, reviewed by Laljo John Pullokkaran, Jesus Camacho Rodriguez)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/0ad4f717
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/0ad4f717
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/0ad4f717
Branch: refs/heads/hbase-metastore
Commit: 0ad4f717a7c06bd7bbd90d4b3e861ba1e25d14b7
Parents: 941610f
Author: Hari Subramaniyan <ha...@apache.org>
Authored: Mon Jul 20 17:17:03 2015 -0700
Committer: Hari Subramaniyan <ha...@apache.org>
Committed: Mon Jul 20 17:17:03 2015 -0700
----------------------------------------------------------------------
.../apache/hadoop/hive/ql/lib/RuleRegExp.java | 191 ++++++++++++++++++-
.../hadoop/hive/ql/lib/TestRuleRegExp.java | 118 ++++++++++++
2 files changed, 300 insertions(+), 9 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hive/blob/0ad4f717/ql/src/java/org/apache/hadoop/hive/ql/lib/RuleRegExp.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/lib/RuleRegExp.java b/ql/src/java/org/apache/hadoop/hive/ql/lib/RuleRegExp.java
index ddc96c2..c88ed68 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/lib/RuleRegExp.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/lib/RuleRegExp.java
@@ -18,6 +18,9 @@
package org.apache.hadoop.hive.ql.lib;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -31,7 +34,54 @@ import org.apache.hadoop.hive.ql.parse.SemanticException;
public class RuleRegExp implements Rule {
private final String ruleName;
- private final Pattern pattern;
+ private final Pattern patternWithWildCardChar;
+ private final String patternWithoutWildCardChar;
+ private String[] patternORWildChar;
+ private static final Set<Character> wildCards = new HashSet<Character>(Arrays.asList(
+ '[', '^', '$', '*', ']', '+', '|', '(', '\\', '.', '?', ')', '&'));
+
+ /**
+ * The function iterates through the list of wild card characters and sees if
+ * this regular expression contains a wild card character.
+ *
+ * @param pattern
+ * pattern expressed as a regular Expression
+ */
+ private static boolean patternHasWildCardChar(String pattern) {
+ if (pattern == null) {
+ return false;
+ }
+ for (char pc : pattern.toCharArray()) {
+ if (wildCards.contains(pc)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * The function iterates through the list of wild card characters and sees if
+ * this regular expression contains only the given char as wild card character.
+ *
+ * @param pattern
+ * pattern expressed as a regular Expression
+ * @param wcc
+ * wild card character
+ */
+ private static boolean patternHasOnlyWildCardChar(String pattern, char wcc) {
+ if (pattern == null) {
+ return false;
+ }
+ boolean ret = true;
+ boolean hasWildCard = false;
+ for (char pc : pattern.toCharArray()) {
+ if (wildCards.contains(pc)) {
+ hasWildCard = true;
+ ret = ret && (pc == wcc);
+ }
+ }
+ return ret && hasWildCard;
+ }
/**
* The rule specified by the regular expression. Note that, the regular
@@ -46,33 +96,156 @@ public class RuleRegExp implements Rule {
**/
public RuleRegExp(String ruleName, String regExp) {
this.ruleName = ruleName;
- pattern = Pattern.compile(regExp);
+
+ if (patternHasWildCardChar(regExp)) {
+ if (patternHasOnlyWildCardChar(regExp, '|')) {
+ this.patternWithWildCardChar = null;
+ this.patternWithoutWildCardChar = null;
+ this.patternORWildChar = regExp.split("\\|");
+ } else {
+ this.patternWithWildCardChar = Pattern.compile(regExp);
+ this.patternWithoutWildCardChar = null;
+ this.patternORWildChar = null;
+ }
+ } else {
+ this.patternWithWildCardChar = null;
+ this.patternWithoutWildCardChar = regExp;
+ this.patternORWildChar = null;
+ }
}
/**
- * This function returns the cost of the rule for the specified stack. Lower
- * the cost, the better the rule is matched
- *
+ * This function returns the cost of the rule for the specified stack when the pattern
+ * matched for has no wildcard character in it. The function expects patternWithoutWildCardChar
+ * to be not null.
* @param stack
* Node stack encountered so far
* @return cost of the function
* @throws SemanticException
*/
- @Override
- public int cost(Stack<Node> stack) throws SemanticException {
+ private int costPatternWithoutWildCardChar(Stack<Node> stack) throws SemanticException {
int numElems = (stack != null ? stack.size() : 0);
+ String name = new String("");
+ int patLen = patternWithoutWildCardChar.length();
+
+ for (int pos = numElems - 1; pos >= 0; pos--) {
+ name = stack.get(pos).getName() + "%" + name;
+ if (name.length() >= patLen) {
+ if (patternWithoutWildCardChar.equals(name)) {
+ return patLen;
+ } else {
+ return -1;
+ }
+ }
+ }
+ return -1;
+ }
+
+ /**
+ * This function returns the cost of the rule for the specified stack when the pattern
+ * matched for has only OR wildcard character in it. The function expects patternORWildChar
+ * to be not null.
+ * @param stack
+ * Node stack encountered so far
+ * @return cost of the function
+ * @throws SemanticException
+ */
+ private int costPatternWithORWildCardChar(Stack<Node> stack) throws SemanticException {
+ int numElems = (stack != null ? stack.size() : 0);
+ for (String pattern : patternORWildChar) {
+ String name = new String("");
+ int patLen = pattern.length();
+
+ for (int pos = numElems - 1; pos >= 0; pos--) {
+ name = stack.get(pos).getName() + "%" + name;
+ if (name.length() >= patLen) {
+ if (pattern.equals(name)) {
+ return patLen;
+ } else {
+ break;
+ }
+ }
+ }
+ }
+ return -1;
+ }
+
+ /**
+ * This function returns the cost of the rule for the specified stack when the pattern
+ * matched for has wildcard character in it. The function expects patternWithWildCardChar
+ * to be not null.
+ *
+ * @param stack
+ * Node stack encountered so far
+ * @return cost of the function
+ * @throws SemanticException
+ */
+ private int costPatternWithWildCardChar(Stack<Node> stack) throws SemanticException {
+ int numElems = (stack != null ? stack.size() : 0);
String name = "";
+ Matcher m = patternWithWildCardChar.matcher("");
for (int pos = numElems - 1; pos >= 0; pos--) {
name = stack.get(pos).getName() + "%" + name;
- Matcher m = pattern.matcher(name);
+ m.reset(name);
if (m.matches()) {
- return m.group().length();
+ return name.length();
}
}
return -1;
}
/**
+ * Returns true if the rule pattern is valid and has wild character in it.
+ */
+ boolean rulePatternIsValidWithWildCardChar() {
+ return patternWithoutWildCardChar == null && patternWithWildCardChar != null && this.patternORWildChar == null;
+ }
+
+ /**
+ * Returns true if the rule pattern is valid and has wild character in it.
+ */
+ boolean rulePatternIsValidWithoutWildCardChar() {
+ return patternWithWildCardChar == null && patternWithoutWildCardChar != null && this.patternORWildChar == null;
+ }
+
+ /**
+ * Returns true if the rule pattern is valid and has wild character in it.
+ */
+ boolean rulePatternIsValidWithORWildCardChar() {
+ return patternWithoutWildCardChar == null && patternWithWildCardChar == null && this.patternORWildChar != null;
+ }
+
+ /**
+ * This function returns the cost of the rule for the specified stack. Lower
+ * the cost, the better the rule is matched
+ *
+ * @param stack
+ * Node stack encountered so far
+ * @return cost of the function
+ * @throws SemanticException
+ */
+ @Override
+ public int cost(Stack<Node> stack) throws SemanticException {
+ if (rulePatternIsValidWithoutWildCardChar()) {
+ return costPatternWithoutWildCardChar(stack);
+ }
+ if (rulePatternIsValidWithWildCardChar()) {
+ return costPatternWithWildCardChar(stack);
+ }
+ if (rulePatternIsValidWithORWildCardChar()) {
+ return costPatternWithORWildCardChar(stack);
+ }
+ // If we reached here, either :
+ // 1. patternWithWildCardChar and patternWithoutWildCardChar are both nulls.
+ // 2. patternWithWildCardChar and patternWithoutWildCardChar are both not nulls.
+ // This is an internal error and we should not let this happen, so throw an exception.
+ throw new SemanticException (
+ "Rule pattern is invalid for " + getName() + " : patternWithWildCardChar = " +
+ patternWithWildCardChar + " patternWithoutWildCardChar = " +
+ patternWithoutWildCardChar);
+ }
+
+ /**
* @return the name of the Node
**/
@Override
http://git-wip-us.apache.org/repos/asf/hive/blob/0ad4f717/ql/src/test/org/apache/hadoop/hive/ql/lib/TestRuleRegExp.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/lib/TestRuleRegExp.java b/ql/src/test/org/apache/hadoop/hive/ql/lib/TestRuleRegExp.java
new file mode 100644
index 0000000..f06d0df
--- /dev/null
+++ b/ql/src/test/org/apache/hadoop/hive/ql/lib/TestRuleRegExp.java
@@ -0,0 +1,118 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.lib;
+
+import static org.junit.Assert.*;
+
+import java.util.List;
+import java.util.Stack;
+
+import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
+import org.apache.hadoop.hive.ql.exec.FilterOperator;
+import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
+import org.apache.hadoop.hive.ql.exec.SelectOperator;
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.junit.Test;
+
+public class TestRuleRegExp {
+
+ public class TestNode implements Node {
+ private String name;
+
+ TestNode (String name) {
+ this.name = name;
+ }
+
+ @Override
+ public List<? extends Node> getChildren() {
+ return null;
+ }
+
+ @Override
+ public String getName() {
+ return name;
+ }
+ }
+
+ @Test
+ public void testPatternWithoutWildCardChar() {
+ String patternStr =
+ ReduceSinkOperator.getOperatorName() + "%" +
+ SelectOperator.getOperatorName() + "%" +
+ FileSinkOperator.getOperatorName() + "%";
+ RuleRegExp rule1 = new RuleRegExp("R1", patternStr);
+ assertEquals(rule1.rulePatternIsValidWithoutWildCardChar(), true);
+ assertEquals(rule1.rulePatternIsValidWithWildCardChar(), false);
+ // positive test
+ Stack<Node> ns1 = new Stack<Node>();
+ ns1.push(new TestNode(ReduceSinkOperator.getOperatorName()));
+ ns1.push(new TestNode(SelectOperator.getOperatorName()));
+ ns1.push(new TestNode(FileSinkOperator.getOperatorName()));
+ try {
+ assertEquals(rule1.cost(ns1), patternStr.length());
+ } catch (SemanticException e) {
+ fail(e.getMessage());
+ }
+ // negative test
+ Stack<Node> ns2 = new Stack<Node>();
+ ns2.push(new TestNode(ReduceSinkOperator.getOperatorName()));
+ ns1.push(new TestNode(TableScanOperator.getOperatorName()));
+ ns1.push(new TestNode(FileSinkOperator.getOperatorName()));
+ try {
+ assertEquals(rule1.cost(ns2), -1);
+ } catch (SemanticException e) {
+ fail(e.getMessage());
+ }
+ }
+
+ @Test
+ public void testPatternWithWildCardChar() {
+ RuleRegExp rule1 = new RuleRegExp("R1",
+ "(" + TableScanOperator.getOperatorName() + "%"
+ + FilterOperator.getOperatorName() + "%)|("
+ + TableScanOperator.getOperatorName() + "%"
+ + FileSinkOperator.getOperatorName() + "%)");
+ assertEquals(rule1.rulePatternIsValidWithoutWildCardChar(), false);
+ assertEquals(rule1.rulePatternIsValidWithWildCardChar(), true);
+ // positive test
+ Stack<Node> ns1 = new Stack<Node>();
+ ns1.push(new TestNode(TableScanOperator.getOperatorName()));
+ ns1.push(new TestNode(FilterOperator.getOperatorName()));
+ Stack<Node> ns2 = new Stack<Node>();
+ ns2.push(new TestNode(TableScanOperator.getOperatorName()));
+ ns2.push(new TestNode(FileSinkOperator.getOperatorName()));
+ try {
+ assertNotEquals(rule1.cost(ns1), -1);
+ assertNotEquals(rule1.cost(ns2), -1);
+ } catch (SemanticException e) {
+ fail(e.getMessage());
+ }
+ // negative test
+ Stack<Node> ns3 = new Stack<Node>();
+ ns3.push(new TestNode(ReduceSinkOperator.getOperatorName()));
+ ns3.push(new TestNode(ReduceSinkOperator.getOperatorName()));
+ ns3.push(new TestNode(FileSinkOperator.getOperatorName()));
+ try {
+ assertEquals(rule1.cost(ns3), -1);
+ } catch (SemanticException e) {
+ fail(e.getMessage());
+ }
+ }
+
+}