You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@drill.apache.org by am...@apache.org on 2018/01/24 18:35:19 UTC
[05/11] drill git commit: DRILL-5879: Improved SQL Pattern Contains
Performance
DRILL-5879: Improved SQL Pattern Contains Performance
close apache/drill#1072
Project: http://git-wip-us.apache.org/repos/asf/drill/repo
Commit: http://git-wip-us.apache.org/repos/asf/drill/commit/2420b35d
Tree: http://git-wip-us.apache.org/repos/asf/drill/tree/2420b35d
Diff: http://git-wip-us.apache.org/repos/asf/drill/diff/2420b35d
Branch: refs/heads/master
Commit: 2420b35d716a35a10be91ea82ec17bdeb392a77c
Parents: 48623ea
Author: Salim Achouche <sa...@gmail.com>
Authored: Wed Dec 13 14:24:45 2017 -0800
Committer: Aman Sinha <as...@maprtech.com>
Committed: Tue Jan 23 17:37:17 2018 -0800
----------------------------------------------------------------------
.../expr/fn/impl/SqlPatternContainsMatcher.java | 274 ++++++++++++++++++-
.../exec/expr/fn/impl/TestSqlPatterns.java | 112 +++++++-
2 files changed, 363 insertions(+), 23 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/drill/blob/2420b35d/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/SqlPatternContainsMatcher.java
----------------------------------------------------------------------
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/SqlPatternContainsMatcher.java b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/SqlPatternContainsMatcher.java
index 04f5dac..ec95349 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/SqlPatternContainsMatcher.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/SqlPatternContainsMatcher.java
@@ -19,44 +19,292 @@ package org.apache.drill.exec.expr.fn.impl;
import io.netty.buffer.DrillBuf;
-public class SqlPatternContainsMatcher extends AbstractSqlPatternMatcher {
+/** SQL Pattern Contains implementation */
+public final class SqlPatternContainsMatcher extends AbstractSqlPatternMatcher {
+ private final MatcherFcn matcherFcn;
public SqlPatternContainsMatcher(String patternString) {
super(patternString);
+
+ // Pattern matching is 1) a CPU intensive operation and 2) pattern and input dependent. The conclusion is
+ // that there is no single implementation that can do it all well. So, we use multiple implementations
+ // chosen based on the pattern length.
+ if (patternLength == 0) {
+ matcherFcn = new MatcherZero();
+ } else if (patternLength == 1) {
+ matcherFcn = new MatcherOne();
+ } else if (patternLength == 2) {
+ matcherFcn = new MatcherTwo();
+ } else if (patternLength == 3) {
+ matcherFcn = new MatcherThree();
+ } else if (patternLength < 10) {
+ matcherFcn = new MatcherN();
+ } else {
+ matcherFcn = new BoyerMooreMatcher();
+ }
}
@Override
public int match(int start, int end, DrillBuf drillBuf) {
+ return matcherFcn.match(start, end, drillBuf);
+ }
+
+ //--------------------------------------------------------------------------
+ // Inner Data Structure
+ // --------------------------------------------------------------------------
+
+ /** Abstract matcher class to allow us pick the most efficient implementation */
+ private abstract class MatcherFcn {
+ protected final byte[] patternArray;
+
+ protected MatcherFcn() {
+ assert patternByteBuffer.hasArray();
+
+ patternArray = patternByteBuffer.array();
+ }
+
+ /**
+ * @return 1 if the pattern was matched; 0 otherwise
+ */
+ protected abstract int match(int start, int end, DrillBuf drillBuf);
+ }
+
+ /** Handles patterns with length zero */
+ private final class MatcherZero extends MatcherFcn {
- if (patternLength == 0) { // Everything should match for null pattern string
+ private MatcherZero() {
+ }
+
+ /** {@inheritDoc} */
+ @Override
+ protected final int match(int start, int end, DrillBuf drillBuf) {
return 1;
}
+ }
+
+ /** Handles patterns with length one */
+ private final class MatcherOne extends MatcherFcn {
+ final byte firstPatternByte;
+
+ private MatcherOne() {
+ firstPatternByte = patternArray[0];
+ }
+
+ /** {@inheritDoc} */
+ @Override
+ protected final int match(int start, int end, DrillBuf drillBuf) {
+ final int lengthToProcess = end - start;
+
+ // simplePattern string has meta characters i.e % and _ and escape characters removed.
+ // so, we can just directly compare.
+ for (int idx = 0; idx < lengthToProcess; idx++) {
+ byte inputByte = drillBuf.getByte(start + idx);
+
+ if (firstPatternByte != inputByte) {
+ continue;
+ }
+ return 1;
+ }
+ return 0;
+ }
+ }
+
+ /** Handles patterns with length two */
+ private final class MatcherTwo extends MatcherFcn {
+ final byte firstPatternByte;
+ final byte secondPatternByte;
+
+ private MatcherTwo() {
+ firstPatternByte = patternArray[0];
+ secondPatternByte = patternArray[1];
+ }
+
+ /** {@inheritDoc} */
+ @Override
+ protected final int match(int start, int end, DrillBuf drillBuf) {
+ final int lengthToProcess = end - start - 1;
+
+ // simplePattern string has meta characters i.e % and _ and escape characters removed.
+ // so, we can just directly compare.
+ for (int idx = 0; idx < lengthToProcess; idx++) {
+ final byte firstInByte = drillBuf.getByte(start + idx);
- final int txtLength = end - start;
+ if (firstPatternByte != firstInByte) {
+ continue;
+ } else {
+ final byte secondInByte = drillBuf.getByte(start + idx + 1);
- // no match if input string length is less than pattern length
- if (txtLength < patternLength) {
+ if (secondInByte == secondPatternByte) {
+ return 1;
+ }
+ }
+ }
return 0;
}
+ }
+ /** Handles patterns with length three */
+ private final class MatcherThree extends MatcherFcn {
+ final byte firstPatternByte;
+ final byte secondPatternByte;
+ final byte thirdPatternByte;
- final int outerEnd = txtLength - patternLength;
+ private MatcherThree() {
+ firstPatternByte = patternArray[0];
+ secondPatternByte = patternArray[1];
+ thirdPatternByte = patternArray[2];
+ }
- outer:
- for (int txtIndex = 0; txtIndex <= outerEnd; txtIndex++) {
+ /** {@inheritDoc} */
+ @Override
+ protected final int match(int start, int end, DrillBuf drillBuf) {
+ final int lengthToProcess = end - start - 2;
// simplePattern string has meta characters i.e % and _ and escape characters removed.
// so, we can just directly compare.
- for (int patternIndex = 0; patternIndex < patternLength; patternIndex++) {
- if (patternByteBuffer.get(patternIndex) != drillBuf.getByte(start + txtIndex + patternIndex)) {
- continue outer;
+ for (int idx = 0; idx < lengthToProcess; idx++) {
+ final byte inputByte = drillBuf.getByte(start + idx);
+
+ if (firstPatternByte != inputByte) {
+ continue;
+ } else {
+ final byte secondInByte = drillBuf.getByte(start + idx + 1);
+ final byte thirdInByte = drillBuf.getByte(start + idx + 2);
+
+ if (secondInByte == secondPatternByte && thirdInByte == thirdPatternByte) {
+ return 1;
+ }
}
}
+ return 0;
+ }
+ }
- return 1;
+ /** Handles patterns with arbitrary length */
+ private final class MatcherN extends MatcherFcn {
+ final byte firstPatternByte;
+
+ private MatcherN() {
+ firstPatternByte = patternArray[0];
+ }
+
+ /** {@inheritDoc} */
+ @Override
+ protected final int match(int start, int end, DrillBuf drillBuf) {
+ final int lengthToProcess = end - start - patternLength + 1;
+ int patternIndex = 0;
+
+ // simplePattern string has meta characters i.e % and _ and escape characters removed.
+ // so, we can just directly compare.
+ for (int idx = 0; idx < lengthToProcess; idx++) {
+ final byte inputByte = drillBuf.getByte(start + idx);
+
+ if (firstPatternByte == inputByte) {
+ for (patternIndex = 1; patternIndex < patternLength; ++patternIndex) {
+ final byte currInByte = drillBuf.getByte(start + idx + patternIndex);
+ final byte currPattByte = patternArray[patternIndex];
+
+ if (currInByte != currPattByte) {
+ break;
+ }
+ }
+
+ if (patternIndex == patternLength) {
+ return 1;
+ }
+ }
+ }
+ return 0;
+ }
+ }
+
+ /**
+ * Boyer-Moore matcher algorithm; excellent for large patterns and for prefix patterns which appear
+ * frequently in the input.
+ */
+ private final class BoyerMooreMatcher extends MatcherFcn {
+ private final int[] offsetTable;
+ private final int[] characterTable;
+
+ private BoyerMooreMatcher() {
+ super();
+
+ this.offsetTable = makeOffsetTable();
+ this.characterTable = makeCharTable();
+ }
+
+ /** {@inheritDoc} */
+ @Override
+ protected int match(int start, int end, DrillBuf drillBuf) {
+ final int inputLength = end - start;
+
+ for (int idx1 = patternLength - 1, idx2; idx1 < inputLength;) {
+ for (idx2 = patternLength - 1; patternArray[idx2] == drillBuf.getByte(start + idx1); --idx1, --idx2) {
+ if (idx2 == 0) {
+ return 1;
+ }
+ }
+ // idx1 += pattern.length - idx2; // For naive method
+ idx1 += Math.max(offsetTable[patternLength - 1 - idx2], characterTable[drillBuf.getByte(start + idx1) & 0xFF]);
+ }
+ return 0;
+ }
+
+ /** Build the jump table based on the mismatched character information **/
+ private int[] makeCharTable() {
+ final int TABLE_SIZE = 256; // This implementation is based on byte comparison
+ int[] resultTable = new int[TABLE_SIZE];
+
+ for (int idx = 0; idx < resultTable.length; ++idx) {
+ resultTable[idx] = patternLength;
+ }
+
+ for (int idx = 0; idx < patternLength - 1; ++idx) {
+ final int patternValue = ((int) patternArray[idx]) & 0xFF;
+ resultTable[patternValue] = patternLength - 1 - idx;
+ }
+
+ return resultTable;
+ }
+
+ /** Builds the scan offset based on which mismatch occurs. **/
+ private int[] makeOffsetTable() {
+ int[] resultTable = new int[patternLength];
+ int lastPrefixPosition = patternLength;
+
+ for (int idx = patternLength - 1; idx >= 0; --idx) {
+ if (isPrefix(idx + 1)) {
+ lastPrefixPosition = idx + 1;
+ }
+ resultTable[patternLength - 1 - idx] = lastPrefixPosition - idx + patternLength - 1;
+ }
+
+ for (int idx = 0; idx < patternLength - 1; ++idx) {
+ int suffixLen = suffixLength(idx);
+ resultTable[suffixLen] = patternLength - 1 - idx + suffixLen;
+ }
+
+ return resultTable;
}
- return 0;
+ /** Checks whether needle[pos:end] is a prefix of pattern **/
+ private boolean isPrefix(int pos) {
+ for (int idx1 = pos, idx2 = 0; idx1 < patternLength; ++idx1, ++idx2) {
+ if (patternArray[idx1] != patternArray[idx2]) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /** Computes the maximum length of the substring ends at "pos" and is a suffix **/
+ private int suffixLength(int pos) {
+ int result = 0;
+ for (int idx1 = pos, idx2 = patternLength - 1; idx1 >= 0 && patternArray[idx1] == patternArray[idx2]; --idx1, --idx2) {
+ result += 1;
+ }
+ return result;
+ }
}
}
http://git-wip-us.apache.org/repos/asf/drill/blob/2420b35d/exec/java-exec/src/test/java/org/apache/drill/exec/expr/fn/impl/TestSqlPatterns.java
----------------------------------------------------------------------
diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/expr/fn/impl/TestSqlPatterns.java b/exec/java-exec/src/test/java/org/apache/drill/exec/expr/fn/impl/TestSqlPatterns.java
index 2eecb54..7d85719 100644
--- a/exec/java-exec/src/test/java/org/apache/drill/exec/expr/fn/impl/TestSqlPatterns.java
+++ b/exec/java-exec/src/test/java/org/apache/drill/exec/expr/fn/impl/TestSqlPatterns.java
@@ -17,23 +17,25 @@
*/
package org.apache.drill.exec.expr.fn.impl;
-import io.netty.buffer.DrillBuf;
-import org.apache.drill.common.exceptions.DrillRuntimeException;
-import org.apache.drill.exec.memory.BufferAllocator;
-import org.apache.drill.exec.memory.RootAllocatorFactory;
-import org.junit.After;
-import org.junit.Before;
-import org.junit.Ignore;
-import org.junit.Test;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
+import java.util.ArrayList;
+import java.util.List;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
+import org.apache.drill.common.exceptions.DrillRuntimeException;
+import org.apache.drill.exec.memory.BufferAllocator;
+import org.apache.drill.exec.memory.RootAllocatorFactory;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import io.netty.buffer.DrillBuf;
public class TestSqlPatterns {
BufferAllocator allocator;
@@ -446,10 +448,100 @@ public class TestSqlPatterns {
assertEquals(1, sqlPatternComplex.match(0, byteBuffer.limit(), drillBuf)); // should match
}
+ @Test
+ public void testSqlPatternContainsMUltipleMatchers() {
+
+ final String longASCIIString = "Drill supports a variety of NoSQL databases and file systems, including HBase, MongoDB, MapR-DB, HDFS, MapR-FS, Amazon S3, Azure Blob Storage, Google Cloud Storage, Swift, "
+ + "NAS and local files. A single query can join data from multiple datastores. For example, you can join a user profile collection in MongoDB with a directory of event logs in Hadoop.";
+ final String emptyString = "";
+ final String unicodeString = "¤EÀsÆW°ê»Ú®i¶T¤¤¤ß3¼Ó®i¶TÆU2~~";
+
+ final List<SQLPatternTestParams> tests = new ArrayList<SQLPatternTestParams>();
+
+ // Tests for Matcher ZERO
+ tests.add(new SQLPatternTestParams(longASCIIString, "", true));
+ tests.add(new SQLPatternTestParams(emptyString, "", true));
+ tests.add(new SQLPatternTestParams(unicodeString, "", true));
+
+ // Tests for Matcher ONE
+ tests.add(new SQLPatternTestParams(longASCIIString, "N", true));
+ tests.add(new SQLPatternTestParams(longASCIIString, "&", false));
+ tests.add(new SQLPatternTestParams(emptyString, "N", false));
+
+ // Tests for Matcher TWO
+ tests.add(new SQLPatternTestParams(longASCIIString, "SQ", true));
+ tests.add(new SQLPatternTestParams(longASCIIString, "eT", false));
+ tests.add(new SQLPatternTestParams("A", "SQ", false));
+ tests.add(new SQLPatternTestParams(emptyString, "SQ", false));
+ tests.add(new SQLPatternTestParams(unicodeString, "¶", true));
+ tests.add(new SQLPatternTestParams(unicodeString, "AT", false));
+
+ // Tests for Matcher THREE
+ tests.add(new SQLPatternTestParams(longASCIIString, "SQL", true));
+ tests.add(new SQLPatternTestParams(longASCIIString, "cas", false));
+ tests.add(new SQLPatternTestParams("S", "SQL", false));
+ tests.add(new SQLPatternTestParams(emptyString, "SQL", false));
+ tests.add(new SQLPatternTestParams(unicodeString, "¶T", true));
+ tests.add(new SQLPatternTestParams(unicodeString, "¶A", false));
+
+ // Tests for Matcher for patterns of length: 3 < length < 10
+ tests.add(new SQLPatternTestParams(longASCIIString, "MongoDB", true));
+ tests.add(new SQLPatternTestParams(longASCIIString, "MongoDz", false));
+ tests.add(new SQLPatternTestParams("Mon", "MongoDB", false));
+ tests.add(new SQLPatternTestParams(emptyString, "MongoDB", false));
+ tests.add(new SQLPatternTestParams(unicodeString, "®i¶", true));
+ tests.add(new SQLPatternTestParams(unicodeString, "®x¶", false));
+
+ // Tests for Matcher for patterns of length >= 10
+ tests.add(new SQLPatternTestParams(longASCIIString, "multiple datastores", true));
+ tests.add(new SQLPatternTestParams(longASCIIString, "multiple datastorb", false));
+ tests.add(new SQLPatternTestParams("multiple", "multiple datastores", false));
+ tests.add(new SQLPatternTestParams(emptyString, "multiple datastores", false));
+ tests.add(new SQLPatternTestParams(unicodeString, "¶T¤¤¤ß3¼", true));
+ tests.add(new SQLPatternTestParams(unicodeString, "¶T¤¤¤ßz¼", false));
+
+ for (SQLPatternTestParams test : tests) {
+ setDrillBuf(test.inputString);
+
+ RegexpUtil.SqlPatternInfo patternInfo = new RegexpUtil.SqlPatternInfo(RegexpUtil.SqlPatternType.CONTAINS, "", test.patternString);
+ SqlPatternMatcher sqlPatternContains = SqlPatternFactory.getSqlPatternMatcher(patternInfo);
+ int eval = sqlPatternContains.match(0, byteBuffer.limit(), drillBuf);
+ int expectedEval = test.shouldMatch ? 1 : 0;
+
+ if (eval != expectedEval) {
+ System.err.format("test failed; params=%s%n", test);
+ }
+
+ assertEquals(expectedEval, eval);
+ }
+ }
+
+
@After
public void cleanup() {
drillBuf.close();
allocator.close();
}
+
+ // -------------
+ // Inner Classes
+ // -------------
+
+ /** Container class to hold SQL pattern test data */
+ private static class SQLPatternTestParams {
+ private final String inputString;
+ private final String patternString;
+ private final boolean shouldMatch;
+
+ private SQLPatternTestParams(String inputString, String patternString, boolean shouldMatch) {
+ this.inputString = inputString;
+ this.patternString = patternString;
+ this.shouldMatch = shouldMatch;
+ }
+
+ public String toString() {
+ return "input=["+inputString+"], pattern=["+patternString+"], should-match=["+shouldMatch+"]..";
+ }
+ }
}