You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/02/09 15:45:41 UTC
arrow git commit: ARROW-476: Add binary integration test fixture,
add Java support
Repository: arrow
Updated Branches:
refs/heads/master dc6cefde4 -> 3add9181f
ARROW-476: Add binary integration test fixture, add Java support
@julienledem could you review my Java changes? Thanks
Author: Wes McKinney <we...@twosigma.com>
Closes #326 from wesm/ARROW-476 and squashes the following commits:
a75228d [Wes McKinney] Use PoolBuffer instead of std::vector
e5a96a0 [Wes McKinney] Chain exceptions
b23b852 [Wes McKinney] Use hexadecimal for transporting binary data in JSON
1d4e850 [Wes McKinney] Compare byte[] with Arrays.equals
e5f13d5 [Wes McKinney] Add binary integration test fixture, add to JsonFileReader.java, but fails
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/3add9181
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/3add9181
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/3add9181
Branch: refs/heads/master
Commit: 3add9181f98810bcfeae558bf44093d9ab89bc3f
Parents: dc6cefd
Author: Wes McKinney <we...@twosigma.com>
Authored: Thu Feb 9 10:45:35 2017 -0500
Committer: Wes McKinney <we...@twosigma.com>
Committed: Thu Feb 9 10:45:35 2017 -0500
----------------------------------------------------------------------
cpp/src/arrow/ipc/json-internal.cc | 55 ++++++++++++++++++--
integration/integration_test.py | 55 +++++++++++++++++---
java/vector/pom.xml | 5 ++
.../arrow/vector/file/json/JsonFileReader.java | 14 +++++
.../arrow/vector/file/json/JsonFileWriter.java | 6 +++
.../org/apache/arrow/vector/util/Validator.java | 4 ++
6 files changed, 129 insertions(+), 10 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/3add9181/cpp/src/arrow/ipc/json-internal.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc
index 1a95b2c..b9f97dd 100644
--- a/cpp/src/arrow/ipc/json-internal.cc
+++ b/cpp/src/arrow/ipc/json-internal.cc
@@ -17,7 +17,10 @@
#include "arrow/ipc/json-internal.h"
+#include <algorithm>
#include <cstdint>
+#include <cstdlib>
+#include <iostream>
#include <memory>
#include <sstream>
#include <string>
@@ -40,6 +43,8 @@
namespace arrow {
namespace ipc {
+static const char* kAsciiTable = "0123456789ABCDEF";
+
using RjArray = rj::Value::ConstArray;
using RjObject = rj::Value::ConstObject;
@@ -395,14 +400,26 @@ class JsonArrayWriter : public ArrayVisitor {
}
}
-  // String (Utf8), Binary
+  // Write the values of a Binary or String (UTF8) array as JSON strings.
+  //
+  // UTF8 strings are written as is. Binary values are written as upper-case
+  // hexadecimal, two digits per byte.
   template <typename T>
   typename std::enable_if<std::is_base_of<BinaryArray, T>::value, void>::type
   WriteDataValues(const T& arr) {
     for (int i = 0; i < arr.length(); ++i) {
       int32_t length;
       const char* buf = reinterpret_cast<const char*>(arr.GetValue(i, &length));
-      writer_->String(buf, length);
+
+      if (std::is_base_of<StringArray, T>::value) {
+        writer_->String(buf, length);
+      } else {
+        std::string hex_string;
+        hex_string.reserve(length * 2);
+        for (int32_t j = 0; j < length; ++j) {
+          // Index the digit table with an unsigned byte: where plain char is
+          // signed, buf[j] >> 4 is negative for bytes >= 0x80 and would read
+          // before the start of kAsciiTable
+          const uint8_t byte = static_cast<uint8_t>(buf[j]);
+          // Convert to 2 base16 digits
+          hex_string.push_back(kAsciiTable[byte >> 4]);
+          hex_string.push_back(kAsciiTable[byte & 15]);
+        }
+        writer_->String(hex_string);
+      }
     }
   }
@@ -773,6 +790,20 @@ class JsonSchemaReader {
const rj::Value& json_schema_;
};
+// Decode the two hexadecimal characters data[0] and data[1] into one byte.
+//
+// data[0] supplies the high four bits and data[1] the low four bits. Both
+// upper- and lower-case digits are accepted, so files written with lower-case
+// hex digits can be read as well.
+//
+// Returns Status::Invalid if either character is not a hexadecimal digit; in
+// that case *out is left untouched.
+static inline Status ParseHexValue(const char* data, uint8_t* out) {
+  const char* const table_end = kAsciiTable + 16;
+  uint8_t nibbles[2];
+
+  for (int i = 0; i < 2; ++i) {
+    char c = data[i];
+    // Fold 'a'..'f' onto 'A'..'F'; kAsciiTable only holds upper-case digits
+    if (c >= 'a' && c <= 'f') { c = static_cast<char>(c - 'a' + 'A'); }
+
+    // kAsciiTable is in ascending ASCII order, so a binary search locates the
+    // digit and its offset in the table is the digit's value
+    const char* pos = std::lower_bound(kAsciiTable, table_end, c);
+
+    // Error checking: test the end position before dereferencing it
+    if (pos == table_end || *pos != c) {
+      return Status::Invalid("Encountered non-hex digit");
+    }
+    nibbles[i] = static_cast<uint8_t>(pos - kAsciiTable);
+  }
+
+  *out = static_cast<uint8_t>((nibbles[0] << 4) | nibbles[1]);
+  return Status::OK();
+}
+
class JsonArrayReader {
public:
explicit JsonArrayReader(MemoryPool* pool) : pool_(pool) {}
@@ -852,6 +883,8 @@ class JsonArrayReader {
const auto& json_data_arr = json_data->value.GetArray();
DCHECK_EQ(static_cast<int32_t>(json_data_arr.Size()), length);
+
+ auto byte_buffer = std::make_shared<PoolBuffer>(pool_);
for (int i = 0; i < length; ++i) {
if (!is_valid[i]) {
builder.AppendNull();
@@ -860,7 +893,23 @@ class JsonArrayReader {
const rj::Value& val = json_data_arr[i];
DCHECK(val.IsString());
- builder.Append(val.GetString());
+ if (std::is_base_of<StringType, T>::value) {
+ builder.Append(val.GetString());
+ } else {
+ std::string hex_string = val.GetString();
+
+ DCHECK(hex_string.size() % 2 == 0) << "Expected base16 hex string";
+ int64_t length = static_cast<int>(hex_string.size()) / 2;
+
+ if (byte_buffer->size() < length) { RETURN_NOT_OK(byte_buffer->Resize(length)); }
+
+ const char* hex_data = hex_string.c_str();
+ uint8_t* byte_buffer_data = byte_buffer->mutable_data();
+ for (int64_t j = 0; j < length; ++j) {
+ RETURN_NOT_OK(ParseHexValue(hex_data + j * 2, &byte_buffer_data[j]));
+ }
+ RETURN_NOT_OK(builder.Append(byte_buffer_data, length));
+ }
}
return builder.Finish(array);
http://git-wip-us.apache.org/repos/asf/arrow/blob/3add9181/integration/integration_test.py
----------------------------------------------------------------------
diff --git a/integration/integration_test.py b/integration/integration_test.py
index a622bf2..1d8dc29 100644
--- a/integration/integration_test.py
+++ b/integration/integration_test.py
@@ -241,14 +241,18 @@ class BooleanType(PrimitiveType):
return PrimitiveColumn(self.name, size, is_valid, values)
-class StringType(PrimitiveType):
+class BinaryType(PrimitiveType):
@property
def numpy_type(self):
return object
+ @property
+ def column_class(self):
+ return BinaryColumn
+
def _get_type(self):
- return OrderedDict([('name', 'utf8')])
+ return OrderedDict([('name', 'binary')])
def _get_type_layout(self):
return OrderedDict([
@@ -267,11 +271,37 @@ class StringType(PrimitiveType):
for i in range(size):
if is_valid[i]:
+ draw = (np.random.randint(0, 255, size=K)
+ .astype(np.uint8)
+ .tostring())
+ values.append(draw)
+ else:
+ values.append("")
+
+ return self.column_class(self.name, size, is_valid, values)
+
+
+class StringType(BinaryType):
+ """Field type whose JSON type name is 'utf8'; its columns are StringColumn."""
+
+ @property
+ def column_class(self):
+ # Column wrapper that generate_column instantiates for this type
+ return StringColumn
+
+ def _get_type(self):
+ # JSON type descriptor for this field
+ return OrderedDict([('name', 'utf8')])
+
+ def generate_column(self, size):
+ """Build a column of `size` values: rands(K) where is_valid[i] is true, "" otherwise."""
+ K = 7  # passed to rands(); presumably the generated string length - confirm
+ is_valid = self._make_is_valid(size)
+ values = []
+
+ for i in range(size):
+ if is_valid[i]:
values.append(rands(K))
else:
values.append("")
- return StringColumn(self.name, size, is_valid, values)
+ return self.column_class(self.name, size, is_valid, values)
class JSONSchema(object):
@@ -285,7 +315,10 @@ class JSONSchema(object):
])
-class StringColumn(PrimitiveColumn):
+class BinaryColumn(PrimitiveColumn):
+
+ def _encode_value(self, x):
+ # Hex-encode x: each element becomes two upper-case base16 digits, and the
+ # digits are concatenated into one string.
+ # Assumes iterating x yields integers (e.g. Python 3 bytes) - confirm.
+ return ''.join('{:02x}'.format(c).upper() for c in x)
def _get_buffers(self):
offset = 0
@@ -299,7 +332,7 @@ class StringColumn(PrimitiveColumn):
v = ""
offsets.append(offset)
- data.append(v)
+ data.append(self._encode_value(v))
return [
('VALIDITY', [int(x) for x in self.is_valid]),
@@ -308,6 +341,12 @@ class StringColumn(PrimitiveColumn):
]
+class StringColumn(BinaryColumn):
+ """BinaryColumn variant whose values are emitted unchanged rather than hex-encoded."""
+
+ def _encode_value(self, x):
+ # Identity: the value is returned as is
+ return x
+
+
class ListType(DataType):
def __init__(self, name, value_type, nullable=True):
@@ -443,7 +482,9 @@ class JSONFile(object):
def get_field(name, type_, nullable=True):
- if type_ == 'utf8':
+ if type_ == 'binary':
+ return BinaryType(name, nullable=nullable)
+ elif type_ == 'utf8':
return StringType(name, nullable=nullable)
dtype = np.dtype(type_)
@@ -463,7 +504,7 @@ def get_field(name, type_, nullable=True):
def generate_primitive_case():
types = ['bool', 'int8', 'int16', 'int32', 'int64',
'uint8', 'uint16', 'uint32', 'uint64',
- 'float32', 'float64', 'utf8']
+ 'float32', 'float64', 'binary', 'utf8']
fields = []
http://git-wip-us.apache.org/repos/asf/arrow/blob/3add9181/java/vector/pom.xml
----------------------------------------------------------------------
diff --git a/java/vector/pom.xml b/java/vector/pom.xml
index 64b68bf..8517d4c 100644
--- a/java/vector/pom.xml
+++ b/java/vector/pom.xml
@@ -56,6 +56,11 @@
<artifactId>commons-lang3</artifactId>
<version>3.4</version>
</dependency>
+ <dependency>
+ <groupId>commons-codec</groupId>
+ <artifactId>commons-codec</artifactId>
+ <version>1.10</version>
+ </dependency>
</dependencies>
<pluginRepositories>
http://git-wip-us.apache.org/repos/asf/arrow/blob/3add9181/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java
----------------------------------------------------------------------
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java
index 71fe88e..24fdc18 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java
@@ -48,6 +48,7 @@ import org.apache.arrow.vector.UInt4Vector;
import org.apache.arrow.vector.UInt8Vector;
import org.apache.arrow.vector.ValueVector;
import org.apache.arrow.vector.ValueVector.Mutator;
+import org.apache.arrow.vector.VarBinaryVector;
import org.apache.arrow.vector.VarCharVector;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.complex.NullableMapVector;
@@ -60,6 +61,8 @@ import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonToken;
import com.fasterxml.jackson.databind.MappingJsonFactory;
import com.google.common.base.Objects;
+import org.apache.commons.codec.DecoderException;
+import org.apache.commons.codec.binary.Hex;
public class JsonFileReader implements AutoCloseable {
private final File inputFile;
@@ -164,6 +167,14 @@ public class JsonFileReader implements AutoCloseable {
readToken(END_OBJECT);
}
+ private byte[] decodeHexSafe(String hexString) throws IOException {
+ try {
+ return Hex.decodeHex(hexString.toCharArray());
+ } catch (DecoderException e) {
+ throw new IOException("Unable to decode hex string: " + hexString, e);
+ }
+ }
+
private void setValueFromParser(ValueVector valueVector, int i) throws IOException {
switch (valueVector.getMinorType()) {
case BIT:
@@ -199,6 +210,9 @@ public class JsonFileReader implements AutoCloseable {
case FLOAT8:
((Float8Vector)valueVector).getMutator().set(i, parser.readValueAs(Double.class));
break;
+ case VARBINARY:
+ ((VarBinaryVector)valueVector).getMutator().setSafe(i, decodeHexSafe(parser.readValueAs(String.class)));
+ break;
case VARCHAR:
((VarCharVector)valueVector).getMutator().setSafe(i, parser.readValueAs(String.class).getBytes(UTF_8));
break;
http://git-wip-us.apache.org/repos/asf/arrow/blob/3add9181/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java
----------------------------------------------------------------------
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java
index ddc8043..99040b6 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java
@@ -30,6 +30,7 @@ import org.apache.arrow.vector.TimeStampMicroVector;
import org.apache.arrow.vector.TimeStampNanoVector;
import org.apache.arrow.vector.ValueVector;
import org.apache.arrow.vector.ValueVector.Accessor;
+import org.apache.arrow.vector.VarBinaryVector;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.schema.ArrowVectorType;
import org.apache.arrow.vector.types.pojo.Field;
@@ -40,6 +41,7 @@ import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.core.util.DefaultPrettyPrinter;
import com.fasterxml.jackson.core.util.DefaultPrettyPrinter.NopIndenter;
import com.fasterxml.jackson.databind.MappingJsonFactory;
+import org.apache.commons.codec.binary.Hex;
public class JsonFileWriter implements AutoCloseable {
@@ -157,6 +159,10 @@ public class JsonFileWriter implements AutoCloseable {
case BIT:
generator.writeNumber(((BitVector)valueVector).getAccessor().get(i));
break;
+ case VARBINARY:
+ String hexString = Hex.encodeHexString(((VarBinaryVector) valueVector).getAccessor().get(i));
+ generator.writeObject(hexString);
+ break;
default:
// TODO: each type
Accessor accessor = valueVector.getAccessor();
http://git-wip-us.apache.org/repos/asf/arrow/blob/3add9181/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java
----------------------------------------------------------------------
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java b/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java
index a974582..f294e20 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/util/Validator.java
@@ -17,6 +17,7 @@
*/
package org.apache.arrow.vector.util;
+import java.util.Arrays;
import java.util.List;
import org.apache.arrow.vector.FieldVector;
@@ -89,7 +90,10 @@ public class Validator {
default:
throw new UnsupportedOperationException("unsupported precision: " + fpType);
}
+ } else if (type instanceof ArrowType.Binary) {
+ return Arrays.equals((byte[]) o1, (byte[]) o2);
}
+
return Objects.equal(o1, o2);
}