You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by rx...@apache.org on 2015/07/30 22:09:46 UTC
spark git commit: [SPARK-9460] Fix prefix generation for UTF8String.
Repository: spark
Updated Branches:
refs/heads/master 6d94bf6ac -> a20e743fb
[SPARK-9460] Fix prefix generation for UTF8String.
Previously we could get garbage data if the number of bytes is 0, on JVMs that are 4-byte aligned, or when compressed oops is on.
Author: Reynold Xin <rx...@databricks.com>
Closes #7789 from rxin/utf8string and squashes the following commits:
86ffa3e [Reynold Xin] Mask out data outside of valid range.
4d647ed [Reynold Xin] Mask out data.
c6e8794 [Reynold Xin] [SPARK-9460] Fix prefix generation for UTF8String.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a20e743f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a20e743f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a20e743f
Branch: refs/heads/master
Commit: a20e743fb863de809863652931bc982aac2d1f86
Parents: 6d94bf6
Author: Reynold Xin <rx...@databricks.com>
Authored: Thu Jul 30 13:09:43 2015 -0700
Committer: Reynold Xin <rx...@databricks.com>
Committed: Thu Jul 30 13:09:43 2015 -0700
----------------------------------------------------------------------
.../apache/spark/unsafe/types/UTF8String.java | 36 ++++++++++++++++++--
.../spark/unsafe/types/UTF8StringSuite.java | 8 +++++
2 files changed, 41 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/a20e743f/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
----------------------------------------------------------------------
diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 5752200..c38953f 100644
--- a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -66,6 +66,19 @@ public final class UTF8String implements Comparable<UTF8String>, Serializable {
}
/**
+ * Creates a UTF8String from a byte array, which should be encoded in UTF-8.
+ *
+ * Note: `bytes` will be held by the returned UTF8String.
+ */
+ public static UTF8String fromBytes(byte[] bytes, int offset, int numBytes) {
+ if (bytes != null) {
+ return new UTF8String(bytes, BYTE_ARRAY_OFFSET + offset, numBytes);
+ } else {
+ return null;
+ }
+ }
+
+ /**
* Creates an UTF8String from String.
*/
public static UTF8String fromString(String str) {
@@ -89,10 +102,10 @@ public final class UTF8String implements Comparable<UTF8String>, Serializable {
return fromBytes(spaces);
}
- protected UTF8String(Object base, long offset, int size) {
+ protected UTF8String(Object base, long offset, int numBytes) {
this.base = base;
this.offset = offset;
- this.numBytes = size;
+ this.numBytes = numBytes;
}
/**
@@ -141,7 +154,24 @@ public final class UTF8String implements Comparable<UTF8String>, Serializable {
* Returns a 64-bit integer that can be used as the prefix used in sorting.
*/
public long getPrefix() {
- long p = PlatformDependent.UNSAFE.getLong(base, offset);
+ // Since JVMs are either 4-byte aligned or 8-byte aligned, we check the size of the string.
+ // If size is 0, just return 0.
+ // If size is between 1 and 4 (inclusive), assume data is 4-byte aligned under the hood and
+ // use a getInt to fetch the prefix.
+ // If size is greater than 4, assume we have at least 8 bytes of data to fetch.
+ // After getting the data, we use a mask to mask out data that is not part of the string.
+ long p;
+ if (numBytes >= 8) {
+ p = PlatformDependent.UNSAFE.getLong(base, offset);
+ } else if (numBytes > 4) {
+ p = PlatformDependent.UNSAFE.getLong(base, offset);
+ p = p & ((1L << numBytes * 8) - 1);
+ } else if (numBytes > 0) {
+ p = (long) PlatformDependent.UNSAFE.getInt(base, offset);
+ p = p & ((1L << numBytes * 8) - 1);
+ } else {
+ p = 0;
+ }
p = java.lang.Long.reverseBytes(p);
return p;
}
http://git-wip-us.apache.org/repos/asf/spark/blob/a20e743f/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
----------------------------------------------------------------------
diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index 42e09e4..f2cc19c 100644
--- a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -71,6 +71,14 @@ public class UTF8StringSuite {
fromString("abbbbbbbbbbbasdf").getPrefix() - fromString("bbbbbbbbbbbbasdf").getPrefix() < 0);
assertTrue(fromString("").getPrefix() - fromString("a").getPrefix() < 0);
assertTrue(fromString("你好").getPrefix() - fromString("世界").getPrefix() > 0);
+
+ byte[] buf1 = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+ byte[] buf2 = {1, 2, 3};
+ UTF8String str1 = UTF8String.fromBytes(buf1, 0, 3);
+ UTF8String str2 = UTF8String.fromBytes(buf1, 0, 8);
+ UTF8String str3 = UTF8String.fromBytes(buf2);
+ assertTrue(str1.getPrefix() - str2.getPrefix() < 0);
+ assertEquals(str1.getPrefix(), str3.getPrefix());
}
@Test
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org