You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@iceberg.apache.org by bl...@apache.org on 2019/04/02 21:36:14 UTC

[incubator-iceberg] branch master updated: Use big-endian byte order for UUIDs in Conversions (#135)

This is an automated email from the ASF dual-hosted git repository.

blue pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-iceberg.git


The following commit(s) were added to refs/heads/master by this push:
     new c53d4be  Use big-endian byte order for UUIDs in Conversions (#135)
c53d4be is described below

commit c53d4bee440ed8671b68a74f7c71662d14561359
Author: Anton Okolnychyi <ao...@apple.com>
AuthorDate: Tue Apr 2 22:36:09 2019 +0100

    Use big-endian byte order for UUIDs in Conversions (#135)
---
 .../java/org/apache/iceberg/types/Conversions.java |  11 +-
 .../org/apache/iceberg/types/TestConversions.java  | 123 +++++++++++++++++++++
 .../apache/iceberg/parquet/TestParquetMetrics.java |   5 +-
 3 files changed, 133 insertions(+), 6 deletions(-)

diff --git a/api/src/main/java/org/apache/iceberg/types/Conversions.java b/api/src/main/java/org/apache/iceberg/types/Conversions.java
index 8fe8055..ed795ca 100644
--- a/api/src/main/java/org/apache/iceberg/types/Conversions.java
+++ b/api/src/main/java/org/apache/iceberg/types/Conversions.java
@@ -98,9 +98,9 @@ public class Conversions {
         }
       case UUID:
         UUID uuid = (UUID) value;
-        return ByteBuffer.allocate(16).order(ByteOrder.LITTLE_ENDIAN)
+        return ByteBuffer.allocate(16).order(ByteOrder.BIG_ENDIAN)
             .putLong(0, uuid.getMostSignificantBits())
-            .putLong(1, uuid.getLeastSignificantBits());
+            .putLong(8, uuid.getLeastSignificantBits());
       case FIXED:
       case BINARY:
         return (ByteBuffer) value;
@@ -117,7 +117,12 @@ public class Conversions {
   }
 
   private static Object internalFromByteBuffer(Type type, ByteBuffer buffer) {
-    ByteBuffer tmp = buffer.duplicate().order(ByteOrder.LITTLE_ENDIAN);
+    ByteBuffer tmp = buffer.duplicate();
+    if (type == Types.UUIDType.get() || type instanceof Types.DecimalType) {
+      tmp.order(ByteOrder.BIG_ENDIAN);
+    } else {
+      tmp.order(ByteOrder.LITTLE_ENDIAN);
+    }
     switch (type.typeId()) {
       case BOOLEAN:
         return (tmp.get() != 0x00);
diff --git a/api/src/test/java/org/apache/iceberg/types/TestConversions.java b/api/src/test/java/org/apache/iceberg/types/TestConversions.java
new file mode 100644
index 0000000..78c4c71
--- /dev/null
+++ b/api/src/test/java/org/apache/iceberg/types/TestConversions.java
@@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.types;
+
+import java.math.BigDecimal;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.StandardCharsets;
+import java.util.UUID;
+import org.apache.iceberg.types.Types.BinaryType;
+import org.apache.iceberg.types.Types.BooleanType;
+import org.apache.iceberg.types.Types.DateType;
+import org.apache.iceberg.types.Types.DecimalType;
+import org.apache.iceberg.types.Types.DoubleType;
+import org.apache.iceberg.types.Types.FixedType;
+import org.apache.iceberg.types.Types.FloatType;
+import org.apache.iceberg.types.Types.IntegerType;
+import org.apache.iceberg.types.Types.LongType;
+import org.apache.iceberg.types.Types.StringType;
+import org.apache.iceberg.types.Types.TimeType;
+import org.apache.iceberg.types.Types.TimestampType;
+import org.apache.iceberg.types.Types.UUIDType;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TestConversions {
+
+  @Test
+  public void testByteBufferConversions() {
+    // booleans are stored as 0x00 for 'false' and a non-zero byte for 'true'
+    assertConversion(false, BooleanType.get(), new byte[]{0x00});
+    assertConversion(true, BooleanType.get(), new byte[]{0x01});
+    // integers are stored as 4 bytes in little-endian order
+    // 84202 is 0...01|01001000|11101010 in binary
+    // 11101010 -> -22, 01001000 -> 72, 00000001 -> 1, 00000000 -> 0
+    assertConversion(84202, IntegerType.get(), new byte[]{-22, 72, 1, 0});
+    // longs are stored as 8 bytes in little-endian order
+    // 200L is 0...0|11001000 in binary
+    // 11001000 -> -56, 00000000 -> 0, ... , 00000000 -> 0
+    assertConversion(200L, LongType.get(), new byte[]{-56, 0, 0, 0, 0, 0, 0, 0});
+    // floats are stored as 4 bytes in little-endian order
+    // floating point numbers are represented as sign * 2^exponent * mantissa
+    // -4.5F is -1 * 2^2 * 1.125 and encoded as 11000000|10010000|0...0 in binary
+    // 00000000 -> 0, 00000000 -> 0, 10010000 -> -112, 11000000 -> -64
+    assertConversion(-4.5F, FloatType.get(), new byte[]{0, 0, -112, -64});
+    // doubles are stored as 8 bytes in little-endian order
+    // floating point numbers are represented as sign * 2^exponent * mantissa
+    // 6.0 is 1 * 2^2 * 1.5 and encoded as 01000000|00011000|0...0 in binary
+    // 00000000 -> 0, ... , 00011000 -> 24, 01000000 -> 64
+    assertConversion(6.0, DoubleType.get(), new byte[]{0, 0, 0, 0, 0, 0, 24, 64});
+    // dates are stored as days from 1970-01-01 in a 4-byte little-endian int
+    // 1000 is 0...0|00000011|11101000 in binary
+    // 11101000 -> -24, 00000011 -> 3, ... , 00000000 -> 0
+    assertConversion(1000, DateType.get(), new byte[]{-24, 3, 0, 0});
+    // time is stored as microseconds from midnight in an 8-byte little-endian long
+    // 10000L is 0...0|00100111|00010000 in binary
+    // 00010000 -> 16, 00100111 -> 39, ... , 00000000 -> 0
+    assertConversion(10000L, TimeType.get(), new byte[]{16, 39, 0, 0, 0, 0, 0, 0});
+    // timestamps are stored as microseconds from 1970-01-01 00:00:00.000000 in an 8-byte little-endian long
+    // 400000L is 0...110|00011010|10000000 in binary
+    // 10000000 -> -128, 00011010 -> 26, 00000110 -> 6, ... , 00000000 -> 0
+    assertConversion(400000L, TimestampType.withoutZone(), new byte[]{-128, 26, 6, 0, 0, 0, 0, 0});
+    assertConversion(400000L, TimestampType.withZone(), new byte[]{-128, 26, 6, 0, 0, 0, 0, 0});
+    // strings are stored as UTF-8 bytes (without length)
+    // 'A' -> 65, 'B' -> 66, 'C' -> 67
+    assertConversion(CharBuffer.wrap("ABC"), StringType.get(), new byte[]{65, 66, 67});
+    // uuids are stored as 16-byte big-endian values
+    // f79c3e09-677c-4bbd-a479-3f349cb785e7 is encoded as F7 9C 3E 09 67 7C 4B BD A4 79 3F 34 9C B7 85 E7
+    // 0xF7 -> 11110111 -> -9, 0x9C -> 10011100 -> -100, 0x3E -> 00111110 -> 62,
+    // 0x09 -> 00001001 -> 9, 0x67 -> 01100111 -> 103, 0x7C -> 01111100 -> 124,
+    // 0x4B -> 01001011 -> 75, 0xBD -> 10111101 -> -67, 0xA4 -> 10100100 -> -92,
+    // 0x79 -> 01111001 -> 121, 0x3F -> 00111111 -> 63, 0x34 -> 00110100 -> 52,
+    // 0x9C -> 10011100 -> -100, 0xB7 -> 10110111 -> -73, 0x85 -> 10000101 -> -123,
+    // 0xE7 -> 11100111 -> -25
+    assertConversion(
+        UUID.fromString("f79c3e09-677c-4bbd-a479-3f349cb785e7"),
+        UUIDType.get(),
+        new byte[]{-9, -100, 62, 9, 103, 124, 75, -67, -92, 121, 63, 52, -100, -73, -123, -25});
+    // fixed values are stored directly
+    // 'a' -> 97, 'b' -> 98
+    assertConversion(
+        ByteBuffer.wrap("ab".getBytes(StandardCharsets.UTF_8)),
+        FixedType.ofLength(2),
+        new byte[]{97, 98});
+    // binary values are stored directly
+    // 'Z' -> 90
+    assertConversion(
+        ByteBuffer.wrap("Z".getBytes(StandardCharsets.UTF_8)),
+        BinaryType.get(),
+        new byte[]{90});
+    // decimals are stored as unscaled values in the form of two's-complement big-endian binary,
+    // using the minimum number of bytes for the values
+    // 345 is 0...1|01011001 in binary
+    // 00000001 -> 1, 01011001 -> 89
+    assertConversion(
+        new BigDecimal("3.45"),
+        DecimalType.of(3, 2),
+        new byte[]{1, 89});
+  }
+
+  private <T> void assertConversion(T value, Type type, byte[] expectedBinary) {
+    ByteBuffer byteBuffer = Conversions.toByteBuffer(type, value);
+    Assert.assertArrayEquals(expectedBinary, byteBuffer.array());
+    Assert.assertEquals(value, Conversions.fromByteBuffer(type, byteBuffer));
+  }
+}
diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetMetrics.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetMetrics.java
index da1e483..229e77a 100644
--- a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetMetrics.java
+++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquetMetrics.java
@@ -145,9 +145,8 @@ public class TestParquetMetrics {
     assertBounds(9, TimeType.get(), 2000L, 3000L, metrics);
     assertCounts(10, 2L, 0L, metrics);
     assertBounds(10, TimestampType.withoutZone(), 0L, 1000L, metrics);
-    // TODO: enable once issue#126 is resolved
-    // assertCounts(11, 2L, 1L, metrics);
-    // assertBounds(11, UUIDType.get(), uuid, uuid, metrics);
+    assertCounts(11, 2L, 1L, metrics);
+    assertBounds(11, UUIDType.get(), uuid, uuid, metrics);
     assertCounts(12, 2L, 0L, metrics);
     assertBounds(12, FixedType.ofLength(4),
         ByteBuffer.wrap(fixed.bytes()), ByteBuffer.wrap(fixed.bytes()), metrics);