You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by rx...@apache.org on 2015/07/30 22:09:46 UTC
spark git commit: [SPARK-9460] Fix prefix generation for UTF8String.

Repository: spark
Updated Branches:
  refs/heads/master 6d94bf6ac -> a20e743fb


[SPARK-9460] Fix prefix generation for UTF8String.

Previously we could be getting garbage data if the number of bytes is 0, or on JVMs that are 4 byte aligned, or when compressedoops is on.

Author: Reynold Xin <rx...@databricks.com>

Closes #7789 from rxin/utf8string and squashes the following commits:

86ffa3e [Reynold Xin] Mask out data outside of valid range.
4d647ed [Reynold Xin] Mask out data.
c6e8794 [Reynold Xin] [SPARK-9460] Fix prefix generation for UTF8String.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a20e743f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a20e743f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a20e743f

Branch: refs/heads/master
Commit: a20e743fb863de809863652931bc982aac2d1f86
Parents: 6d94bf6
Author: Reynold Xin <rx...@databricks.com>
Authored: Thu Jul 30 13:09:43 2015 -0700
Committer: Reynold Xin <rx...@databricks.com>
Committed: Thu Jul 30 13:09:43 2015 -0700

----------------------------------------------------------------------
 .../apache/spark/unsafe/types/UTF8String.java   | 36 ++++++++++++++++++--
 .../spark/unsafe/types/UTF8StringSuite.java     |  8 +++++
 2 files changed, 41 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/a20e743f/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
----------------------------------------------------------------------
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 5752200..c38953f 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -66,6 +66,19 @@ public final class UTF8String implements Comparable<UTF8String>, Serializable {
   }
 
   /**
+   * Creates an UTF8String from byte array, which should be encoded in UTF-8.
+   *
+   * Note: `bytes` will be hold by returned UTF8String.
+   */
+  public static UTF8String fromBytes(byte[] bytes, int offset, int numBytes) {
+    if (bytes != null) {
+      return new UTF8String(bytes, BYTE_ARRAY_OFFSET + offset, numBytes);
+    } else {
+      return null;
+    }
+  }
+
+  /**
    * Creates an UTF8String from String.
    */
   public static UTF8String fromString(String str) {
@@ -89,10 +102,10 @@ public final class UTF8String implements Comparable<UTF8String>, Serializable {
     return fromBytes(spaces);
   }
 
-  protected UTF8String(Object base, long offset, int size) {
+  protected UTF8String(Object base, long offset, int numBytes) {
     this.base = base;
     this.offset = offset;
-    this.numBytes = size;
+    this.numBytes = numBytes;
   }
 
   /**
@@ -141,7 +154,24 @@ public final class UTF8String implements Comparable<UTF8String>, Serializable {
    * Returns a 64-bit integer that can be used as the prefix used in sorting.
    */
   public long getPrefix() {
-    long p = PlatformDependent.UNSAFE.getLong(base, offset);
+    // Since JVMs are either 4-byte aligned or 8-byte aligned, we check the size of the string.
+    // If size is 0, just return 0.
+    // If size is between 0 and 4 (inclusive), assume data is 4-byte aligned under the hood and
+    // use a getInt to fetch the prefix.
+    // If size is greater than 4, assume we have at least 8 bytes of data to fetch.
+    // After getting the data, we use a mask to mask out data that is not part of the string.
+    long p;
+    if (numBytes >= 8) {
+      p = PlatformDependent.UNSAFE.getLong(base, offset);
+    } else  if (numBytes > 4) {
+      p = PlatformDependent.UNSAFE.getLong(base, offset);
+      p = p & ((1L << numBytes * 8) - 1);
+    } else if (numBytes > 0) {
+      p = (long) PlatformDependent.UNSAFE.getInt(base, offset);
+      p = p & ((1L << numBytes * 8) - 1);
+    } else {
+      p = 0;
+    }
     p = java.lang.Long.reverseBytes(p);
     return p;
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/a20e743f/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
----------------------------------------------------------------------
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index 42e09e4..f2cc19c 100644
--- a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -71,6 +71,14 @@ public class UTF8StringSuite {
       fromString("abbbbbbbbbbbasdf").getPrefix() - fromString("bbbbbbbbbbbbasdf").getPrefix() < 0);
     assertTrue(fromString("").getPrefix() - fromString("a").getPrefix() < 0);
     assertTrue(fromString("你好").getPrefix() - fromString("世界").getPrefix() > 0);
+
+    byte[] buf1 = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+    byte[] buf2 = {1, 2, 3};
+    UTF8String str1 = UTF8String.fromBytes(buf1, 0, 3);
+    UTF8String str2 = UTF8String.fromBytes(buf1, 0, 8);
+    UTF8String str3 = UTF8String.fromBytes(buf2);
+    assertTrue(str1.getPrefix() - str2.getPrefix() < 0);
+    assertEquals(str1.getPrefix(), str3.getPrefix());
   }
 
   @Test


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org