You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@drill.apache.org by jn...@apache.org on 2016/06/16 03:37:40 UTC

drill git commit: DRILL-4573: Fixed issue with string functions when input contains non-ASCII characters.

Repository: drill
Updated Branches:
  refs/heads/master c2d9959e0 -> 2e2b54af5


DRILL-4573: Fixed issue with string functions when input contains non-ASCII characters.

Close apache/drill#512


Project: http://git-wip-us.apache.org/repos/asf/drill/repo
Commit: http://git-wip-us.apache.org/repos/asf/drill/commit/2e2b54af
Tree: http://git-wip-us.apache.org/repos/asf/drill/tree/2e2b54af
Diff: http://git-wip-us.apache.org/repos/asf/drill/diff/2e2b54af

Branch: refs/heads/master
Commit: 2e2b54af5379f7046e84390e70f3862cddd93195
Parents: c2d9959
Author: jean-claude cote <jc...@gmail.com>
Authored: Sat May 14 18:02:57 2016 -0400
Committer: Jinfeng Ni <jn...@apache.org>
Committed: Wed Jun 15 17:46:56 2016 -0700

----------------------------------------------------------------------
 .../exec/expr/fn/impl/CharSequenceWrapper.java  | 204 +++++++++++++++++--
 .../exec/expr/fn/impl/StringFunctions.java      |  28 ++-
 .../exec/expr/fn/impl/TestStringFunctions.java  |  26 +++
 3 files changed, 234 insertions(+), 24 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/drill/blob/2e2b54af/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/CharSequenceWrapper.java
----------------------------------------------------------------------
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/CharSequenceWrapper.java b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/CharSequenceWrapper.java
index 6c475ed..d085494 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/CharSequenceWrapper.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/CharSequenceWrapper.java
@@ -6,9 +6,7 @@
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
- *
  * http://www.apache.org/licenses/LICENSE-2.0
- *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -17,33 +15,203 @@
  */
 package org.apache.drill.exec.expr.fn.impl;
 
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CoderResult;
+import java.util.regex.Matcher;
+
 import io.netty.buffer.DrillBuf;
 
+/**
+ * A CharSequence is a readable sequence of char values. This interface provides
+ * uniform, read-only access to many different kinds of char sequences. A char
+ * value represents a character in the Basic Multilingual Plane (BMP) or a
+ * surrogate. Refer to Unicode Character Representation for details.<br>
+ * Specifically this implementation of the CharSequence adapts a Drill
+ * {@link DrillBuf} to the CharSequence. The implementation is meant to be
+ * re-used that is allocated once and then passed DrillBuf to adapt. This can be
+ * handy to exploit API that consume CharSequence avoiding the need to create
+ * string objects.
+ *
+ */
 public class CharSequenceWrapper implements CharSequence {
 
-    private int start;
-    private int end;
-    private DrillBuf buffer;
+  // The adapted drill buffer (in the case of US-ASCII)
+  private DrillBuf buffer;
+  // The converted bytes in the case of non ASCII
+  private CharBuffer charBuffer;
+  // initial char buffer capacity
+  private static final int INITIAL_CHAR_BUF = 1024;
+  // The decoder to use in the case of non ASCII
+  private CharsetDecoder decoder;
+
+  // The start offset into the drill buffer
+  private int start;
+  // The end offset into the drill buffer
+  private int end;
+  // Indicates that the current byte buffer contains only ascii chars
+  private boolean usAscii;
+
+  public CharSequenceWrapper() {
+  }
+
+  public CharSequenceWrapper(int start, int end, DrillBuf buffer) {
+    setBuffer(start, end, buffer);
+  }
 
-    @Override
-    public int length() {
-        return end - start;
+  @Override
+  public int length() {
+    return end - start;
+  }
+
+  @Override
+  public char charAt(int index) {
+    if (usAscii) {
+      // Each byte is a char, the index is relative to the start of the original buffer
+      return (char) (buffer.getByte(start + index) & 0x00FF);
+    } else {
+      // The char buffer is a copy so the index directly corresponds
+      return charBuffer.charAt(index);
     }
+  }
+
+  /**
+   * When using the Java regex {@link Matcher} the subSequence is only called
+   * when capturing groups. Drill does not currently use capture groups in the
+   * UDF so this method is not required.<br>
+   * It could be implemented by creating a new CharSequenceWrapper however
+   * this would imply newly allocated objects which is what this wrapper tries
+   * to avoid.
+   *
+   */
+  @Override
+  public CharSequence subSequence(int start, int end) {
+    throw new UnsupportedOperationException();
+  }
+
+  /**
+   * Set the DrillBuf to adapt to a CharSequence. This method can be used to
+   * replace any previous DrillBuf thus avoiding recreating the
+   * CharSequenceWrapper and thus re-using the CharSequenceWrapper object.
+   *
+   * @param start
+   * @param end
+   * @param buffer
+   */
+  public void setBuffer(int start, int end, DrillBuf buffer) {
+    // Test if buffer is an ASCII string or not.
+    usAscii = isAscii(start, end, buffer);
 
-    @Override
-    public char charAt(int index) {
-        return (char) buffer.getByte(start + index);
+    if (usAscii) {
+      // each byte equals one char
+      this.start = start;
+      this.end = end;
+      this.buffer = buffer;
+    } else {
+      initCharBuffer();
+      // Wrap with java byte buffer
+      ByteBuffer byteBuf = buffer.nioBuffer(start, end - start);
+      while (charBuffer.capacity() < Integer.MAX_VALUE) {
+        byteBuf.mark();
+        if (decodeUT8(byteBuf)) {
+          break;
+        }
+        // Failed to convert because the char buffer was not large enough
+        growCharBuffer();
+        // Make sure to reset the byte buffer we need to reprocess it
+        byteBuf.reset();
+      }
+      this.start = 0;
+      this.end = charBuffer.position();
+      // reset the char buffer so the index are relative to the start of the buffer
+      charBuffer.rewind();
     }
+  }
 
-    @Override
-    public CharSequence subSequence(int start, int end) {
-        throw new UnsupportedOperationException("Not implemented.");
+  /**
+   * Test if the buffer contains only ASCII bytes.
+   * @param start
+   * @param end
+   * @param buffer
+   * @return
+   */
+  private boolean isAscii(int start, int end, DrillBuf buffer) {
+    for (int i = start; i < end; i++) {
+      byte bb = buffer.getByte(i);
+      if (bb < 0) {
+        //System.out.printf("Not a ASCII byte 0x%02X\n", bb);
+        return false;
+      }
     }
+    return true;
+  }
 
-    public void setBuffer(int start, int end, DrillBuf buffer) {
-        this.start = start;
-        this.end = end;
-        this.buffer = buffer;
+  /**
+   * Initialize the charbuffer and decoder if they are not yet initialized.
+   */
+  private void initCharBuffer() {
+    if (charBuffer == null) {
+      charBuffer = CharBuffer.allocate(INITIAL_CHAR_BUF);
     }
+    if (decoder == null) {
+      decoder = Charset.forName("UTF-8").newDecoder();
+    }
+  }
+
+  /**
+   * Decode the buffer using the CharsetDecoder.
+   * @param byteBuf
+   * @return false if failed because the charbuffer was not big enough
+   * @throws RuntimeException if it fails for encoding errors
+   */
+  private boolean decodeUT8(ByteBuffer byteBuf) {
+    // We give it all of the input data in call.
+    boolean endOfInput = true;
+    decoder.reset();
+    charBuffer.rewind();
+    // Convert utf-8 bytes to sequence of chars
+    CoderResult result = decoder.decode(byteBuf, charBuffer, endOfInput);
+    if (result.isOverflow()) {
+      // Not enough space in the charBuffer.
+      return false;
+    } else if (result.isError()) {
+      // Any other error
+      try {
+        result.throwException();
+      } catch (CharacterCodingException e) {
+        throw new RuntimeException(e);
+      }
+    }
+    return true;
+  }
+
+  /**
+   * Grow the charbuffer making sure not to overflow size integer. Note
+   * this grows in the same manner as the ArrayList that is it adds 50%
+   * to the current size.
+   */
+  private void growCharBuffer() {
+    // overflow-conscious code
+    int oldCapacity = charBuffer.capacity();
+    //System.out.println("old capacity " + oldCapacity);
+    int newCapacity = oldCapacity + (oldCapacity >> 1);
+    if (newCapacity < 0) {
+      newCapacity = Integer.MAX_VALUE;
+    }
+    //System.out.println("new capacity " + newCapacity);
+    charBuffer = CharBuffer.allocate(newCapacity);
+  }
+
+  /**
+   * The regexp_replace function is implemented in a way to avoid the call to toString()
+   * not to uselessly create a string object.
+   */
+  @Override
+  public String toString() {
+    throw new UnsupportedOperationException();
+  }
 
 }

http://git-wip-us.apache.org/repos/asf/drill/blob/2e2b54af/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java
----------------------------------------------------------------------
diff --git a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java
index 524d254..50ff435 100644
--- a/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java
+++ b/exec/java-exec/src/main/java/org/apache/drill/exec/expr/fn/impl/StringFunctions.java
@@ -243,14 +243,30 @@ public class StringFunctions{
     public void eval() {
       out.start = 0;
       charSequenceWrapper.setBuffer(input.start, input.end, input.buffer);
+      final String r = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(replacement.start, replacement.end, replacement.buffer);
       // Reusing same charSequenceWrapper, no need to pass it in.
-      // This saves one method call since reset(CharSequence) calls reset()
       matcher.reset();
-      final String r = org.apache.drill.exec.expr.fn.impl.StringFunctionHelpers.toStringFromUTF8(replacement.start, replacement.end, replacement.buffer);
-      final byte [] bytea = matcher.replaceAll(r).getBytes(java.nio.charset.Charset.forName("UTF-8"));
-      out.buffer = buffer = buffer.reallocIfNeeded(bytea.length);
-      out.buffer.setBytes(out.start, bytea);
-      out.end = bytea.length;
+      // Implementation of Matcher.replaceAll() in-lined to avoid creating String object
+      // in cases where we don't actually replace anything.
+      boolean result = matcher.find();
+      if (result) {
+          StringBuffer sb = new StringBuffer();
+          do {
+              matcher.appendReplacement(sb, r);
+              result = matcher.find();
+          } while (result);
+          matcher.appendTail(sb);
+          final byte [] bytea = sb.toString().getBytes(java.nio.charset.Charset.forName("UTF-8"));
+          out.buffer = buffer = buffer.reallocIfNeeded(bytea.length);
+          out.buffer.setBytes(out.start, bytea);
+          out.end = bytea.length;
+      }
+      else {
+          // There is no matches, copy the input bytes into the output buffer
+          out.buffer = buffer = buffer.reallocIfNeeded(input.end - input.start);
+          out.buffer.setBytes(0, input.buffer, input.start, input.end - input.start);
+          out.end = input.end - input.start;
+      }
     }
   }
 

http://git-wip-us.apache.org/repos/asf/drill/blob/2e2b54af/exec/java-exec/src/test/java/org/apache/drill/exec/expr/fn/impl/TestStringFunctions.java
----------------------------------------------------------------------
diff --git a/exec/java-exec/src/test/java/org/apache/drill/exec/expr/fn/impl/TestStringFunctions.java b/exec/java-exec/src/test/java/org/apache/drill/exec/expr/fn/impl/TestStringFunctions.java
index 612408b..daedd1c 100644
--- a/exec/java-exec/src/test/java/org/apache/drill/exec/expr/fn/impl/TestStringFunctions.java
+++ b/exec/java-exec/src/test/java/org/apache/drill/exec/expr/fn/impl/TestStringFunctions.java
@@ -117,6 +117,32 @@ public class TestStringFunctions extends BaseTestQuery {
   }
 
   @Test
+  public void testRegexpMatchesNonAscii() throws Exception {
+    testBuilder()
+        .sqlQuery("select regexp_matches(a, 'M�nchen') res1, regexp_matches(b, 'AM�nchenA') res2 " +
+            "from (values('M�nchen', 'M�nchenA'), ('M�nchenA', 'AM�nchenA')) as t(a,b)")
+        .unOrdered()
+        .baselineColumns("res1", "res2")
+        .baselineValues(true, false)
+        .baselineValues(false, true)
+        .build()
+        .run();
+  }
+
+  @Test
+  public void testRegexpReplace() throws Exception {
+    testBuilder()
+        .sqlQuery("select regexp_replace(a, 'a|c', 'x') res1, regexp_replace(b, 'd', 'zzz') res2 " +
+                  "from (values('abc', 'bcd'), ('bcd', 'abc')) as t(a,b)")
+        .unOrdered()
+        .baselineColumns("res1", "res2")
+        .baselineValues("xbx", "bczzz")
+        .baselineValues("bxd", "abc")
+        .build()
+        .run();
+  }
+
+  @Test
   public void testILike() throws Exception {
     testBuilder()
         .sqlQuery("select n_name from cp.`tpch/nation.parquet` where ilike(n_name, '%united%') = true")