You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/02/09 15:45:41 UTC

arrow git commit: ARROW-476: Add binary integration test fixture, add Java support

Repository: arrow
Updated Branches:
  refs/heads/master dc6cefde4 -> 3add9181f


ARROW-476: Add binary integration test fixture, add Java support

@julienledem could you review my Java changes? Thanks

Author: Wes McKinney <we...@twosigma.com>

Closes #326 from wesm/ARROW-476 and squashes the following commits:

a75228d [Wes McKinney] Use PoolBuffer instead of std::vector
e5a96a0 [Wes McKinney] Chain exceptions
b23b852 [Wes McKinney] Use hexadecimal for transporting binary data in JSON
1d4e850 [Wes McKinney] Compare byte[] with Arrays.equals
e5f13d5 [Wes McKinney] Add binary integration test fixture, add to JsonFileReader.java, but fails


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/3add9181
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/3add9181
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/3add9181

Branch: refs/heads/master
Commit: 3add9181f98810bcfeae558bf44093d9ab89bc3f
Parents: dc6cefd
Author: Wes McKinney <we...@twosigma.com>
Authored: Thu Feb 9 10:45:35 2017 -0500
Committer: Wes McKinney <we...@twosigma.com>
Committed: Thu Feb 9 10:45:35 2017 -0500

----------------------------------------------------------------------
 cpp/src/arrow/ipc/json-internal.cc              | 55 ++++++++++++++++++--
 integration/integration_test.py                 | 55 +++++++++++++++++---
 java/vector/pom.xml                             |  5 ++
 .../arrow/vector/file/json/JsonFileReader.java  | 14 +++++
 .../arrow/vector/file/json/JsonFileWriter.java  |  6 +++
 .../org/apache/arrow/vector/util/Validator.java |  4 ++
 6 files changed, 129 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/3add9181/cpp/src/arrow/ipc/json-internal.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc
index 1a95b2c..b9f97dd 100644
--- a/cpp/src/arrow/ipc/json-internal.cc
+++ b/cpp/src/arrow/ipc/json-internal.cc
@@ -17,7 +17,10 @@
 
 #include "arrow/ipc/json-internal.h"
 
+#include <algorithm>
 #include <cstdint>
+#include <cstdlib>
+#include <iostream>
 #include <memory>
 #include <sstream>
 #include <string>
@@ -40,6 +43,8 @@
 namespace arrow {
 namespace ipc {
 
+static const char* kAsciiTable = "0123456789ABCDEF";
+
 using RjArray = rj::Value::ConstArray;
 using RjObject = rj::Value::ConstObject;
 
@@ -395,14 +400,26 @@ class JsonArrayWriter : public ArrayVisitor {
     }
   }
 
-  // String (Utf8), Binary
+  // Binary values are written as uppercase hex strings; UTF8 strings are written as is
   template <typename T>
   typename std::enable_if<std::is_base_of<BinaryArray, T>::value, void>::type
   WriteDataValues(const T& arr) {
     for (int i = 0; i < arr.length(); ++i) {
       int32_t length;
       const char* buf = reinterpret_cast<const char*>(arr.GetValue(i, &length));
-      writer_->String(buf, length);
+
+      if (std::is_base_of<StringArray, T>::value) {
+        writer_->String(buf, length);
+      } else {
+        std::string hex_string;
+        hex_string.reserve(length * 2);
+        for (int32_t j = 0; j < length; ++j) {
+          // 2 base16 digits; go through uint8_t so a signed char cannot index negatively
+          hex_string.push_back(kAsciiTable[static_cast<uint8_t>(buf[j]) >> 4]);
+          hex_string.push_back(kAsciiTable[static_cast<uint8_t>(buf[j]) & 15]);
+        }
+        writer_->String(hex_string);
+      }
     }
   }
 
@@ -773,6 +790,20 @@ class JsonSchemaReader {
   const rj::Value& json_schema_;
 };
 
+static inline Status ParseHexValue(const char* data, uint8_t* out) {
+  // Decodes the hex digit pair data[0], data[1] into *out; letters may be either case
+  uint8_t value = 0;
+  for (int k = 0; k < 2; ++k) {
+    char c = data[k];
+    if (c >= 'a' && c <= 'f') { c = static_cast<char>(c - 'a' + 'A'); }
+    const char* pos = std::lower_bound(kAsciiTable, kAsciiTable + 16, c);
+    if (*pos != c) { return Status::Invalid("Encountered non-hex digit"); }
+    value = static_cast<uint8_t>(value << 4 | (pos - kAsciiTable));
+  }
+  *out = value;
+  return Status::OK();
+}
+
 class JsonArrayReader {
  public:
   explicit JsonArrayReader(MemoryPool* pool) : pool_(pool) {}
@@ -852,6 +883,8 @@ class JsonArrayReader {
     const auto& json_data_arr = json_data->value.GetArray();
 
     DCHECK_EQ(static_cast<int32_t>(json_data_arr.Size()), length);
+
+    auto byte_buffer = std::make_shared<PoolBuffer>(pool_);
     for (int i = 0; i < length; ++i) {
       if (!is_valid[i]) {
         builder.AppendNull();
@@ -860,7 +893,23 @@ class JsonArrayReader {
 
       const rj::Value& val = json_data_arr[i];
       DCHECK(val.IsString());
-      builder.Append(val.GetString());
+      if (std::is_base_of<StringType, T>::value) {
+        builder.Append(val.GetString());
+      } else {
+        std::string hex_string = val.GetString();
+
+        DCHECK(hex_string.size() % 2 == 0) << "Expected base16 hex string";
+        int64_t length = static_cast<int>(hex_string.size()) / 2;
+
+        if (byte_buffer->size() < length) { RETURN_NOT_OK(byte_buffer->Resize(length)); }
+
+        const char* hex_data = hex_string.c_str();
+        uint8_t* byte_buffer_data = byte_buffer->mutable_data();
+        for (int64_t j = 0; j < length; ++j) {
+          RETURN_NOT_OK(ParseHexValue(hex_data + j * 2, &byte_buffer_data[j]));
+        }
+        RETURN_NOT_OK(builder.Append(byte_buffer_data, length));
+      }
     }
 
     return builder.Finish(array);

http://git-wip-us.apache.org/repos/asf/arrow/blob/3add9181/integration/integration_test.py
----------------------------------------------------------------------
diff --git a/integration/integration_test.py b/integration/integration_test.py
index a622bf2..1d8dc29 100644
--- a/integration/integration_test.py
+++ b/integration/integration_test.py
@@ -241,14 +241,18 @@ class BooleanType(PrimitiveType):
         return PrimitiveColumn(self.name, size, is_valid, values)
 
 
-class StringType(PrimitiveType):
+class BinaryType(PrimitiveType):
 
     @property
     def numpy_type(self):
         return object
 
+    @property
+    def column_class(self):
+        return BinaryColumn
+
     def _get_type(self):
-        return OrderedDict([('name', 'utf8')])
+        return OrderedDict([('name', 'binary')])
 
     def _get_type_layout(self):
         return OrderedDict([
@@ -267,11 +271,37 @@ class StringType(PrimitiveType):
 
         for i in range(size):
             if is_valid[i]:
+                draw = (np.random.randint(0, 255, size=K)
+                        .astype(np.uint8)
+                        .tostring())
+                values.append(draw)
+            else:
+                values.append("")
+
+        return self.column_class(self.name, size, is_valid, values)
+
+
+class StringType(BinaryType):
+
+    @property
+    def column_class(self):
+        return StringColumn
+
+    def _get_type(self):
+        return OrderedDict([('name', 'utf8')])
+
+    def generate_column(self, size):
+        K = 7
+        is_valid = self._make_is_valid(size)
+        values = []
+
+        for i in range(size):
+            if is_valid[i]:
                 values.append(rands(K))
             else:
                 values.append("")
 
-        return StringColumn(self.name, size, is_valid, values)
+        return self.column_class(self.name, size, is_valid, values)
 
 
 class JSONSchema(object):
@@ -285,7 +315,10 @@ class JSONSchema(object):
         ])
 
 
-class StringColumn(PrimitiveColumn):
+class BinaryColumn(PrimitiveColumn):
+
+    def _encode_value(self, x):
+        return ''.join('{:02x}'.format(c).upper() for c in x)
 
     def _get_buffers(self):
         offset = 0
@@ -299,7 +332,7 @@ class StringColumn(PrimitiveColumn):
                 v = ""
 
             offsets.append(offset)
-            data.append(v)
+            data.append(self._encode_value(v))
 
         return [
             ('VALIDITY', [int(x) for x in self.is_valid]),
@@ -308,6 +341,12 @@ class StringColumn(PrimitiveColumn):
         ]
 
 
+class StringColumn(BinaryColumn):
+
+    def _encode_value(self, x):
+        return x
+
+
 class ListType(DataType):
 
     def __init__(self, name, value_type, nullable=True):
@@ -443,7 +482,9 @@ class JSONFile(object):
 
 
 def get_field(name, type_, nullable=True):
-    if type_ == 'utf8':
+    if type_ == 'binary':
+        return BinaryType(name, nullable=nullable)
+    elif type_ == 'utf8':
         return StringType(name, nullable=nullable)
 
     dtype = np.dtype(type_)
@@ -463,7 +504,7 @@ def get_field(name, type_, nullable=True):
 def generate_primitive_case():
     types = ['bool', 'int8', 'int16', 'int32', 'int64',
              'uint8', 'uint16', 'uint32', 'uint64',
-             'float32', 'float64', 'utf8']
+             'float32', 'float64', 'binary', 'utf8']
 
     fields = []
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/3add9181/java/vector/pom.xml
----------------------------------------------------------------------
diff --git a/java/vector/pom.xml b/java/vector/pom.xml
index 64b68bf..8517d4c 100644
--- a/java/vector/pom.xml
+++ b/java/vector/pom.xml
@@ -56,6 +56,11 @@
       <artifactId>commons-lang3</artifactId>
       <version>3.4</version>
     </dependency>
+    <dependency>
+      <groupId>commons-codec</groupId>
+      <artifactId>commons-codec</artifactId>
+      <version>1.10</version>
+    </dependency>
   </dependencies>
 
     <pluginRepositories>

http://git-wip-us.apache.org/repos/asf/arrow/blob/3add9181/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java
----------------------------------------------------------------------
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java
index 71fe88e..24fdc18 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java
@@ -48,6 +48,7 @@ import org.apache.arrow.vector.UInt4Vector;
 import org.apache.arrow.vector.UInt8Vector;
 import org.apache.arrow.vector.ValueVector;
 import org.apache.arrow.vector.ValueVector.Mutator;
+import org.apache.arrow.vector.VarBinaryVector;
 import org.apache.arrow.vector.VarCharVector;
 import org.apache.arrow.vector.VectorSchemaRoot;
 import org.apache.arrow.vector.complex.NullableMapVector;
@@ -60,6 +61,8 @@ import com.fasterxml.jackson.core.JsonParser;
 import com.fasterxml.jackson.core.JsonToken;
 import com.fasterxml.jackson.databind.MappingJsonFactory;
 import com.google.common.base.Objects;
+import org.apache.commons.codec.DecoderException;
+import org.apache.commons.codec.binary.Hex;
 
 public class JsonFileReader implements AutoCloseable {
   private final File inputFile;
@@ -164,6 +167,14 @@ public class JsonFileReader implements AutoCloseable {
     readToken(END_OBJECT);
   }
 
+  private byte[] decodeHexSafe(String hexString) throws IOException {
+    try {
+      return Hex.decodeHex(hexString.toCharArray());
+    } catch (DecoderException e) {
+      throw new IOException("Unable to decode hex string: " + hexString, e);
+    }
+  }
+
   private void setValueFromParser(ValueVector valueVector, int i) throws IOException {
     switch (valueVector.getMinorType()) {
     case BIT:
@@ -199,6 +210,9 @@ public class JsonFileReader implements AutoCloseable {
     case FLOAT8:
       ((Float8Vector)valueVector).getMutator().set(i, parser.readValueAs(Double.class));
       break;
+    case VARBINARY:
+      ((VarBinaryVector)valueVector).getMutator().setSafe(i, decodeHexSafe(parser.readValueAs(String.class)));
+      break;
     case VARCHAR:
       ((VarCharVector)valueVector).getMutator().setSafe(i, parser.readValueAs(String.class).getBytes(UTF_8));
       break;

http://git-wip-us.apache.org/repos/asf/arrow/blob/3add9181/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java
----------------------------------------------------------------------
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java
index ddc8043..99040b6 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java
@@ -30,6 +30,7 @@ import org.apache.arrow.vector.TimeStampMicroVector;
 import org.apache.arrow.vector.TimeStampNanoVector;
 import org.apache.arrow.vector.ValueVector;
 import org.apache.arrow.vector.ValueVector.Accessor;
+import org.apache.arrow.vector.VarBinaryVector;
 import org.apache.arrow.vector.VectorSchemaRoot;
 import org.apache.arrow.vector.schema.ArrowVectorType;
 import org.apache.arrow.vector.types.pojo.Field;
@@ -40,6 +41,7 @@ import com.fasterxml.jackson.core.JsonGenerator;
 import com.fasterxml.jackson.core.util.DefaultPrettyPrinter;
 import com.fasterxml.jackson.core.util.DefaultPrettyPrinter.NopIndenter;
 import com.fasterxml.jackson.databind.MappingJsonFactory;
+import org.apache.commons.codec.binary.Hex;
 
 public class JsonFileWriter implements AutoCloseable {
 
@@ -157,6 +159,10 @@ public class JsonFileWriter implements AutoCloseable {
       case BIT:
         generator.writeNumber(((BitVector)valueVector).getAccessor().get(i));
         break;
+      case VARBINARY:
+        String hexString = Hex.encodeHexString(((VarBinaryVector) valueVector).getAccessor().get(i));
+        generator.writeObject(hexString);
+        break;
       default:
         // TODO: each type
         Accessor accessor = valueVector.getAccessor();

http://git-wip-us.apache.org/repos/asf/arrow/blob/3add9181/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java
----------------------------------------------------------------------
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java b/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java
index a974582..f294e20 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java
@@ -17,6 +17,7 @@
  */
 package org.apache.arrow.vector.util;
 
+import java.util.Arrays;
 import java.util.List;
 
 import org.apache.arrow.vector.FieldVector;
@@ -89,7 +90,10 @@ public class Validator {
                 default:
                     throw new UnsupportedOperationException("unsupported precision: " + fpType);
             }
+        } else if (type instanceof ArrowType.Binary) {
+            return Arrays.equals((byte[]) o1, (byte[]) o2);
         }
+
         return Objects.equal(o1, o2);
     }