You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tajo.apache.org by ji...@apache.org on 2016/04/15 07:34:36 UTC

tajo git commit: TAJO-2119: Invalid sort result when sort key columns contain non-ascii values.

Repository: tajo
Updated Branches:
  refs/heads/master 36f691d9f -> 45100ced2


TAJO-2119: Invalid sort result when sort key columns contain non-ascii values.

Closes #998


Project: http://git-wip-us.apache.org/repos/asf/tajo/repo
Commit: http://git-wip-us.apache.org/repos/asf/tajo/commit/45100ced
Tree: http://git-wip-us.apache.org/repos/asf/tajo/tree/45100ced
Diff: http://git-wip-us.apache.org/repos/asf/tajo/diff/45100ced

Branch: refs/heads/master
Commit: 45100ced2c44e92eed1c24bc58a118e74f58d277
Parents: 36f691d
Author: Jihoon Son <ji...@apache.org>
Authored: Fri Apr 15 14:33:02 2016 +0900
Committer: Jihoon Son <ji...@apache.org>
Committed: Fri Apr 15 14:33:02 2016 +0900

----------------------------------------------------------------------
 CHANGES                                         |  3 +
 .../memory/UnSafeTupleBytesComparator.java      | 52 +++++++----------
 .../memory/TestUnSafeTupleBytesComparator.java  | 61 ++++++++++++++++++++
 3 files changed, 84 insertions(+), 32 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tajo/blob/45100ced/CHANGES
----------------------------------------------------------------------
diff --git a/CHANGES b/CHANGES
index cefb3df..7565faf 100644
--- a/CHANGES
+++ b/CHANGES
@@ -122,6 +122,9 @@ Release 0.12.0 - unreleased
 
   BUG FIXES
 
+    TAJO-2119: Invalid sort result when sort key columns contain non-ascii values. 
+    (jihoon)
+
     TAJO-2100: Add missing cancellation in defaultTaskScheduler when a worker is 
     no respond. (jinho)
 

http://git-wip-us.apache.org/repos/asf/tajo/blob/45100ced/tajo-common/src/main/java/org/apache/tajo/tuple/memory/UnSafeTupleBytesComparator.java
----------------------------------------------------------------------
diff --git a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/UnSafeTupleBytesComparator.java b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/UnSafeTupleBytesComparator.java
index 53a78a8..50c4345 100644
--- a/tajo-common/src/main/java/org/apache/tajo/tuple/memory/UnSafeTupleBytesComparator.java
+++ b/tajo-common/src/main/java/org/apache/tajo/tuple/memory/UnSafeTupleBytesComparator.java
@@ -19,6 +19,7 @@
 package org.apache.tajo.tuple.memory;
 
 import com.google.common.primitives.Longs;
+import com.google.common.primitives.UnsignedBytes;
 import com.google.common.primitives.UnsignedLongs;
 import org.apache.tajo.util.SizeOf;
 import org.apache.tajo.util.UnsafeUtil;
@@ -31,10 +32,10 @@ import java.nio.ByteOrder;
  */
 public class UnSafeTupleBytesComparator {
   private static final Unsafe UNSAFE = UnsafeUtil.unsafe;
+  private static final int UNSIGNED_MASK = 0xFF;
+  private static final boolean littleEndian = ByteOrder.nativeOrder().equals(ByteOrder.LITTLE_ENDIAN);
 
-  static final boolean littleEndian =
-      ByteOrder.nativeOrder().equals(ByteOrder.LITTLE_ENDIAN);
-
+  // This code is borrowed from Guava's UnsignedBytes:::compare()
   public static int compare(long ptr1, long ptr2) {
     int lstrLen = UNSAFE.getInt(ptr1);
     int rstrLen = UNSAFE.getInt(ptr2);
@@ -45,42 +46,29 @@ public class UnSafeTupleBytesComparator {
     int minLength = Math.min(lstrLen, rstrLen);
     int minWords = minLength / Longs.BYTES;
 
-        /*
-         * Compare 8 bytes at a time. Benchmarking shows comparing 8 bytes at a
-         * time is no slower than comparing 4 bytes at a time even on 32-bit.
-         * On the other hand, it is substantially faster on 64-bit.
-         */
+    /*
+     * Compare 8 bytes at a time. Benchmarking shows comparing 8 bytes at a
+     * time is no slower than comparing 4 bytes at a time even on 32-bit.
+     * On the other hand, it is substantially faster on 64-bit.
+    */
     for (int i = 0; i < minWords * Longs.BYTES; i += Longs.BYTES) {
       long lw = UNSAFE.getLong(ptr1);
       long rw = UNSAFE.getLong(ptr2);
-      long diff = lw ^ rw;
 
-      if (diff != 0) {
+      if (lw != rw) {
         if (!littleEndian) {
           return UnsignedLongs.compare(lw, rw);
         }
 
-        // Use binary search
-        int n = 0;
-        int y;
-        int x = (int) diff;
-        if (x == 0) {
-          x = (int) (diff >>> 32);
-          n = 32;
-        }
-
-        y = x << 16;
-        if (y == 0) {
-          n += 16;
-        } else {
-          x = y;
-        }
-
-        y = x << 8;
-        if (y == 0) {
-          n += 8;
-        }
-        return (int) (((lw >>> n) & 0xFFL) - ((rw >>> n) & 0xFFL));
+        /*
+         * We want to compare only the first index where left[index] != right[index].
+         * This corresponds to the least significant nonzero byte in lw ^ rw, since lw
+         * and rw are little-endian.  Long.numberOfTrailingZeros(diff) tells us the least
+         * significant nonzero bit, and zeroing out the first three bits of L.nTZ gives us the
+         * shift to get that least significant nonzero byte.
+        */
+        int n = Long.numberOfTrailingZeros(lw ^ rw) & ~0x7;
+        return (int) (((lw >>> n) & UNSIGNED_MASK) - ((rw >>> n) & UNSIGNED_MASK));
       }
 
       ptr1 += SizeOf.SIZE_OF_LONG;
@@ -89,7 +77,7 @@ public class UnSafeTupleBytesComparator {
 
     // The epilogue to cover the last (minLength % 8) elements.
     for (int i = minWords * Longs.BYTES; i < minLength; i++) {
-      int result = UNSAFE.getByte(ptr1++) - UNSAFE.getByte(ptr2++);
+      int result = UnsignedBytes.compare(UNSAFE.getByte(ptr1++), UNSAFE.getByte(ptr2++));
       if (result != 0) {
         return result;
       }

http://git-wip-us.apache.org/repos/asf/tajo/blob/45100ced/tajo-common/src/test/java/org/apache/tajo/tuple/memory/TestUnSafeTupleBytesComparator.java
----------------------------------------------------------------------
diff --git a/tajo-common/src/test/java/org/apache/tajo/tuple/memory/TestUnSafeTupleBytesComparator.java b/tajo-common/src/test/java/org/apache/tajo/tuple/memory/TestUnSafeTupleBytesComparator.java
new file mode 100644
index 0000000..b2bde40
--- /dev/null
+++ b/tajo-common/src/test/java/org/apache/tajo/tuple/memory/TestUnSafeTupleBytesComparator.java
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tajo.tuple.memory;
+
+import org.apache.tajo.common.TajoDataTypes.DataType;
+import org.apache.tajo.common.TajoDataTypes.Type;
+import org.apache.tajo.tuple.RowBlockReader;
+import org.junit.Test;
+
+import static org.junit.Assert.assertTrue;
+
+public class TestUnSafeTupleBytesComparator {
+
+  @Test
+  public void testCompare() {
+    MemoryRowBlock rowBlock = null;
+    try {
+      rowBlock = new MemoryRowBlock(new DataType[]{
+          DataType.newBuilder().setType(Type.TEXT).build()
+      });
+      RowWriter builder = rowBlock.getWriter();
+      builder.startRow();
+      builder.putText("CÔTE D'IVOIRE");
+      builder.endRow();
+      builder.startRow();
+      builder.putText("CANADA");
+      builder.endRow();
+
+      RowBlockReader reader = rowBlock.getReader();
+
+      UnSafeTuple t1 = new UnSafeTuple();
+      UnSafeTuple t2 = new UnSafeTuple();
+      reader.next(t1);
+      reader.next(t2);
+
+      // 'CÔTE D'IVOIRE' should occur later than 'CANADA' in ascending order.
+      int compare = UnSafeTupleBytesComparator.compare(t1.getFieldAddr(0), t2.getFieldAddr(0));
+      assertTrue(compare > 0);
+    } finally {
+      if (rowBlock != null) {
+        rowBlock.release();
+      }
+    }
+  }
+}