You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@drill.apache.org by am...@apache.org on 2018/01/24 18:35:19 UTC

[05/11] drill git commit: DRILL-5879: Improved SQL Pattern Contains Performance

DRILL-5879: Improved SQL Pattern Contains Performance

close apache/drill#1072


Project: http://git-wip-us.apache.org/repos/asf/drill/repo
Commit: http://git-wip-us.apache.org/repos/asf/drill/commit/2420b35d
Tree: http://git-wip-us.apache.org/repos/asf/drill/tree/2420b35d
Diff: http://git-wip-us.apache.org/repos/asf/drill/diff/2420b35d

Branch: refs/heads/master
Commit: 2420b35d716a35a10be91ea82ec17bdeb392a77c
Parents: 48623ea
Author: Salim Achouche <sa...@gmail.com>
Authored: Wed Dec 13 14:24:45 2017 -0800
Committer: Aman Sinha <as...@maprtech.com>
Committed: Tue Jan 23 17:37:17 2018 -0800

----------------------------------------------------------------------
 .../expr/fn/impl/SqlPatternContainsMatcher.java | 274 ++++++++++++++++++-
 .../exec/expr/fn/impl/TestSqlPatterns.java      | 112 +++++++-
 2 files changed, 363 insertions(+), 23 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/drill/blob/2420b35d/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/SqlPatternContainsMatcher.java
----------------------------------------------------------------------
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/SqlPatternContainsMatcher.java b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/SqlPatternContainsMatcher.java
index 04f5dac..ec95349 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/SqlPatternContainsMatcher.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/SqlPatternContainsMatcher.java
@@ -19,44 +19,292 @@ package org.apache.drill.exec.expr.fn.impl;
 
 import io.netty.buffer.DrillBuf;
 
-public class SqlPatternContainsMatcher extends AbstractSqlPatternMatcher {
+/** SQL Pattern Contains implementation */
+public final class SqlPatternContainsMatcher extends AbstractSqlPatternMatcher {
+  private final MatcherFcn matcherFcn;
 
   public SqlPatternContainsMatcher(String patternString) {
     super(patternString);
+
+    // Pattern matching is 1) a CPU intensive operation and 2) pattern and input dependent. The conclusion is
+    // that there is no single implementation that can do it all well. So, we use multiple implementations
+    // chosen based on the pattern length.
+    if (patternLength == 0) {
+      matcherFcn = new MatcherZero();
+    } else if (patternLength == 1) {
+      matcherFcn = new MatcherOne();
+    } else if (patternLength == 2) {
+      matcherFcn = new MatcherTwo();
+    } else if (patternLength == 3) {
+      matcherFcn = new MatcherThree();
+    } else if (patternLength < 10) {
+      matcherFcn = new MatcherN();
+    } else {
+      matcherFcn = new BoyerMooreMatcher();
+    }
   }
 
   @Override
   public int match(int start, int end, DrillBuf drillBuf) {
+    return matcherFcn.match(start, end, drillBuf);
+  }
+
+  //--------------------------------------------------------------------------
+  // Inner Data Structure
+  // --------------------------------------------------------------------------
+
+  /** Abstract matcher class to allow us pick the most efficient implementation */
+  private abstract class MatcherFcn {
+    protected final byte[] patternArray;
+
+    protected MatcherFcn() {
+      assert patternByteBuffer.hasArray();
+
+      patternArray = patternByteBuffer.array();
+    }
+
+    /**
+     * @return 1 if the pattern was matched; 0 otherwise
+     */
+    protected abstract int match(int start, int end, DrillBuf drillBuf);
+  }
+
+  /** Handles patterns with length zero */
+  private final class MatcherZero extends MatcherFcn {
 
-    if (patternLength == 0) { // Everything should match for null pattern string
+    private MatcherZero() {
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    protected final int match(int start, int end, DrillBuf drillBuf) {
       return 1;
     }
+  }
+
+  /** Handles patterns with length one */
+  private final class MatcherOne extends MatcherFcn {
+    final byte firstPatternByte;
+
+    private MatcherOne() {
+      firstPatternByte  = patternArray[0];
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    protected final int match(int start, int end, DrillBuf drillBuf) {
+      final int lengthToProcess = end - start;
+
+      // simplePattern string has meta characters i.e % and _ and escape characters removed.
+      // so, we can just directly compare.
+      for (int idx = 0; idx < lengthToProcess; idx++) {
+        byte inputByte = drillBuf.getByte(start + idx);
+
+        if (firstPatternByte != inputByte) {
+          continue;
+        }
+        return 1;
+      }
+      return 0;
+    }
+  }
+
+  /** Handles patterns with length two */
+  private final class MatcherTwo extends MatcherFcn {
+    final byte firstPatternByte;
+    final byte secondPatternByte;
+
+    private MatcherTwo() {
+      firstPatternByte  = patternArray[0];
+      secondPatternByte = patternArray[1];
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    protected final int match(int start, int end, DrillBuf drillBuf) {
+      final int lengthToProcess = end - start - 1;
+
+      // simplePattern string has meta characters i.e % and _ and escape characters removed.
+      // so, we can just directly compare.
+      for (int idx = 0; idx < lengthToProcess; idx++) {
+        final byte firstInByte = drillBuf.getByte(start + idx);
 
-    final int txtLength = end - start;
+        if (firstPatternByte != firstInByte) {
+          continue;
+        } else {
+          final byte secondInByte = drillBuf.getByte(start + idx + 1);
 
-    // no match if input string length is less than pattern length
-    if (txtLength < patternLength) {
+          if (secondInByte == secondPatternByte) {
+            return 1;
+          }
+        }
+      }
       return 0;
     }
+  }
 
+  /** Handles patterns with length three */
+  private final class MatcherThree extends MatcherFcn {
+    final byte firstPatternByte;
+    final byte secondPatternByte;
+    final byte thirdPatternByte;
 
-    final int outerEnd = txtLength - patternLength;
+    private MatcherThree() {
+      firstPatternByte   = patternArray[0];
+      secondPatternByte  = patternArray[1];
+      thirdPatternByte   = patternArray[2];
+    }
 
-    outer:
-    for (int txtIndex = 0; txtIndex <= outerEnd; txtIndex++) {
+    /** {@inheritDoc} */
+    @Override
+    protected final int match(int start, int end, DrillBuf drillBuf) {
+      final int lengthToProcess = end - start - 2;
 
       // simplePattern string has meta characters i.e % and _ and escape characters removed.
       // so, we can just directly compare.
-      for (int patternIndex = 0; patternIndex < patternLength; patternIndex++) {
-        if (patternByteBuffer.get(patternIndex) != drillBuf.getByte(start + txtIndex + patternIndex)) {
-          continue outer;
+      for (int idx = 0; idx < lengthToProcess; idx++) {
+        final byte inputByte = drillBuf.getByte(start + idx);
+
+        if (firstPatternByte != inputByte) {
+          continue;
+        } else {
+          final byte secondInByte = drillBuf.getByte(start + idx + 1);
+          final byte thirdInByte  = drillBuf.getByte(start + idx + 2);
+
+          if (secondInByte == secondPatternByte && thirdInByte == thirdPatternByte) {
+            return 1;
+          }
         }
       }
+      return 0;
+    }
+  }
 
-      return 1;
+  /** Handles patterns with arbitrary length */
+  private final class MatcherN extends MatcherFcn {
+    final byte firstPatternByte;
+
+    private MatcherN() {
+      firstPatternByte = patternArray[0];
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    protected final int match(int start, int end, DrillBuf drillBuf) {
+      final int lengthToProcess = end - start - patternLength + 1;
+      int patternIndex          = 0;
+
+      // simplePattern string has meta characters i.e % and _ and escape characters removed.
+      // so, we can just directly compare.
+      for (int idx = 0; idx < lengthToProcess; idx++) {
+        final byte inputByte = drillBuf.getByte(start + idx);
+
+        if (firstPatternByte == inputByte) {
+          for (patternIndex = 1; patternIndex < patternLength; ++patternIndex) {
+            final byte currInByte   = drillBuf.getByte(start + idx + patternIndex);
+            final byte currPattByte = patternArray[patternIndex];
+
+            if (currInByte != currPattByte) {
+              break;
+            }
+          }
+
+          if (patternIndex == patternLength) {
+            return 1;
+          }
+        }
+      }
+      return 0;
+    }
+  }
+
+  /**
+   * Boyer-Moore matcher algorithm; excellent for large patterns and for prefix patterns which appear
+   * frequently in the input.
+   */
+  private final class BoyerMooreMatcher extends MatcherFcn {
+    private final int[] offsetTable;
+    private final int[] characterTable;
+
+    private BoyerMooreMatcher() {
+      super();
+
+      this.offsetTable    = makeOffsetTable();
+      this.characterTable = makeCharTable();
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    protected int match(int start, int end, DrillBuf drillBuf)  {
+      final int inputLength = end - start;
+
+      for (int idx1 = patternLength - 1, idx2; idx1 < inputLength;) {
+        for (idx2 = patternLength - 1; patternArray[idx2] == drillBuf.getByte(start + idx1); --idx1, --idx2) {
+          if (idx2 == 0) {
+            return 1;
+          }
+        }
+        // idx1 += pattern.length - idx2; // For naive method
+        idx1 += Math.max(offsetTable[patternLength - 1 - idx2], characterTable[drillBuf.getByte(start + idx1) & 0xFF]);
+      }
+      return 0;
+    }
+
+    /** Build the jump table based on the mismatched character information **/
+    private int[] makeCharTable() {
+      final int TABLE_SIZE = 256; // This implementation is based on byte comparison
+      int[] resultTable    = new int[TABLE_SIZE];
+
+      for (int idx = 0; idx < resultTable.length; ++idx) {
+        resultTable[idx] = patternLength;
+      }
+
+      for (int idx = 0; idx < patternLength - 1; ++idx) {
+        final int patternValue    = ((int) patternArray[idx]) & 0xFF;
+        resultTable[patternValue] = patternLength - 1 - idx;
+      }
+
+      return resultTable;
+    }
+
+    /** Builds the scan offset based on which mismatch occurs. **/
+    private int[] makeOffsetTable() {
+      int[] resultTable      = new int[patternLength];
+      int lastPrefixPosition = patternLength;
+
+      for (int idx = patternLength - 1; idx >= 0; --idx) {
+        if (isPrefix(idx + 1)) {
+          lastPrefixPosition = idx + 1;
+        }
+        resultTable[patternLength - 1 - idx] = lastPrefixPosition - idx + patternLength - 1;
+      }
+
+      for (int idx = 0; idx < patternLength - 1; ++idx) {
+        int suffixLen          = suffixLength(idx);
+        resultTable[suffixLen] = patternLength - 1 - idx + suffixLen;
+      }
+
+      return resultTable;
     }
 
-    return  0;
+    /** Checks whether needle[pos:end] is a prefix of pattern **/
+    private boolean isPrefix(int pos) {
+      for (int idx1 = pos, idx2 = 0; idx1 < patternLength; ++idx1, ++idx2) {
+        if (patternArray[idx1] != patternArray[idx2]) {
+          return false;
+        }
+      }
+      return true;
+    }
+
+    /** Computes the maximum length of the substring ends at "pos" and is a suffix **/
+    private int suffixLength(int pos) {
+      int result = 0;
+      for (int idx1 = pos, idx2 = patternLength - 1; idx1 >= 0 && patternArray[idx1] == patternArray[idx2]; --idx1, --idx2) {
+        result += 1;
+      }
+      return result;
+    }
   }
 
 }

http://git-wip-us.apache.org/repos/asf/drill/blob/2420b35d/exec/java-exec/src/test/java/org/apache/drill/exec/expr/fn/impl/TestSqlPatterns.java
----------------------------------------------------------------------
diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/expr/fn/impl/TestSqlPatterns.java b/exec/java-exec/src/test/java/org/apache/drill/exec/expr/fn/impl/TestSqlPatterns.java
index 2eecb54..7d85719 100644
--- a/exec/java-exec/src/test/java/org/apache/drill/exec/expr/fn/impl/TestSqlPatterns.java
+++ b/exec/java-exec/src/test/java/org/apache/drill/exec/expr/fn/impl/TestSqlPatterns.java
@@ -17,23 +17,25 @@
  */
 
 package org.apache.drill.exec.expr.fn.impl;
-import io.netty.buffer.DrillBuf;
-import org.apache.drill.common.exceptions.DrillRuntimeException;
-import org.apache.drill.exec.memory.BufferAllocator;
-import org.apache.drill.exec.memory.RootAllocatorFactory;
-import org.junit.After;
-import org.junit.Before;
-import org.junit.Ignore;
-import org.junit.Test;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
 
 import java.nio.ByteBuffer;
 import java.nio.CharBuffer;
 import java.nio.charset.CharacterCodingException;
 import java.nio.charset.Charset;
 import java.nio.charset.CharsetEncoder;
+import java.util.ArrayList;
+import java.util.List;
 
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
+import org.apache.drill.common.exceptions.DrillRuntimeException;
+import org.apache.drill.exec.memory.BufferAllocator;
+import org.apache.drill.exec.memory.RootAllocatorFactory;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import io.netty.buffer.DrillBuf;
 
 public class TestSqlPatterns {
   BufferAllocator allocator;
@@ -446,10 +448,100 @@ public class TestSqlPatterns {
     assertEquals(1, sqlPatternComplex.match(0, byteBuffer.limit(), drillBuf)); // should match
   }
 
+  @Test
+  public void testSqlPatternContainsMUltipleMatchers() {
+
+    final String longASCIIString = "Drill supports a variety of NoSQL databases and file systems, including HBase, MongoDB, MapR-DB, HDFS, MapR-FS, Amazon S3, Azure Blob Storage, Google Cloud Storage, Swift, "
+      + "NAS and local files. A single query can join data from multiple datastores. For example, you can join a user profile collection in MongoDB with a directory of event logs in Hadoop.";
+    final String emptyString     = "";
+    final String unicodeString   = "¤EÀsÆW°ê»Ú®i¶T¤¤¤ß3¼Ó®i¶TÆU2~~";
+
+    final List<SQLPatternTestParams> tests = new ArrayList<SQLPatternTestParams>();
+
+    // Tests for Matcher ZERO
+    tests.add(new SQLPatternTestParams(longASCIIString, "", true));
+    tests.add(new SQLPatternTestParams(emptyString,     "", true));
+    tests.add(new SQLPatternTestParams(unicodeString,   "", true));
+
+    // Tests for Matcher ONE
+    tests.add(new SQLPatternTestParams(longASCIIString, "N", true));
+    tests.add(new SQLPatternTestParams(longASCIIString, "&", false));
+    tests.add(new SQLPatternTestParams(emptyString,     "N", false));
+
+    // Tests for Matcher TWO
+    tests.add(new SQLPatternTestParams(longASCIIString, "SQ", true));
+    tests.add(new SQLPatternTestParams(longASCIIString, "eT", false));
+    tests.add(new SQLPatternTestParams("A",             "SQ", false));
+    tests.add(new SQLPatternTestParams(emptyString,     "SQ", false));
+    tests.add(new SQLPatternTestParams(unicodeString,   "¶",  true));
+    tests.add(new SQLPatternTestParams(unicodeString,   "AT", false));
+
+    // Tests for Matcher THREE
+    tests.add(new SQLPatternTestParams(longASCIIString, "SQL", true));
+    tests.add(new SQLPatternTestParams(longASCIIString, "cas", false));
+    tests.add(new SQLPatternTestParams("S",             "SQL", false));
+    tests.add(new SQLPatternTestParams(emptyString,     "SQL", false));
+    tests.add(new SQLPatternTestParams(unicodeString,   "¶T", true));
+    tests.add(new SQLPatternTestParams(unicodeString,   "¶A", false));
+
+    // Tests for Matcher for patterns of length: 3 < length < 10
+    tests.add(new SQLPatternTestParams(longASCIIString, "MongoDB", true));
+    tests.add(new SQLPatternTestParams(longASCIIString, "MongoDz", false));
+    tests.add(new SQLPatternTestParams("Mon",           "MongoDB", false));
+    tests.add(new SQLPatternTestParams(emptyString,     "MongoDB", false));
+    tests.add(new SQLPatternTestParams(unicodeString,   "®i¶", true));
+    tests.add(new SQLPatternTestParams(unicodeString,   "®x¶", false));
+
+    // Tests for Matcher for patterns of length >= 10
+    tests.add(new SQLPatternTestParams(longASCIIString, "multiple datastores", true));
+    tests.add(new SQLPatternTestParams(longASCIIString, "multiple datastorb",  false));
+    tests.add(new SQLPatternTestParams("multiple",      "multiple datastores", false));
+    tests.add(new SQLPatternTestParams(emptyString,     "multiple datastores", false));
+    tests.add(new SQLPatternTestParams(unicodeString,   "¶T¤¤¤ß3¼", true));
+    tests.add(new SQLPatternTestParams(unicodeString,   "¶T¤¤¤ßz¼", false));
+
+    for (SQLPatternTestParams test : tests) {
+      setDrillBuf(test.inputString);
+
+      RegexpUtil.SqlPatternInfo patternInfo = new RegexpUtil.SqlPatternInfo(RegexpUtil.SqlPatternType.CONTAINS, "", test.patternString);
+      SqlPatternMatcher sqlPatternContains  = SqlPatternFactory.getSqlPatternMatcher(patternInfo);
+      int eval                              = sqlPatternContains.match(0, byteBuffer.limit(), drillBuf);
+      int expectedEval                      = test.shouldMatch ? 1 : 0;
+
+      if (eval != expectedEval) {
+        System.err.format("test failed; params=%s%n", test);
+      }
+
+      assertEquals(expectedEval, eval);
+    }
+  }
+
+
   @After
   public void cleanup() {
     drillBuf.close();
     allocator.close();
   }
+
+  // -------------
+  // Inner Classes
+  // -------------
+
+  /** Container class to hold SQL pattern test data */
+  private static class SQLPatternTestParams {
+    private final String inputString;
+    private final String patternString;
+    private final boolean shouldMatch;
+
+    private SQLPatternTestParams(String inputString, String patternString, boolean shouldMatch) {
+      this.inputString   = inputString;
+      this.patternString = patternString;
+      this.shouldMatch   = shouldMatch;
+    }
+
+    public String toString() {
+      return "input=["+inputString+"], pattern=["+patternString+"], should-match=["+shouldMatch+"]..";
+    }
+  }
 }