You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/11/07 21:22:24 UTC
[arrow] branch master updated: ARROW-1716: [Format/JSON] Use string
integer value for Decimals in JSON
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 3188d70 ARROW-1716: [Format/JSON] Use string integer value for Decimals in JSON
3188d70 is described below
commit 3188d70202795d8e0a8092ec5685d859b02e366d
Author: Phillip Cloud <cp...@gmail.com>
AuthorDate: Tue Nov 7 16:22:20 2017 -0500
ARROW-1716: [Format/JSON] Use string integer value for Decimals in JSON
Author: Phillip Cloud <cp...@gmail.com>
Closes #1267 from cpcloud/ARROW-1716 and squashes the following commits:
b4f3aed2 [Phillip Cloud] Add cases for every valid precision
f8d4391f [Phillip Cloud] Use the full range of decimal values in integration tests
6fef5f71 [Phillip Cloud] ARROW-1716: [Format/JSON] Use string integer value for Decimals in JSON
---
cpp/src/arrow/ipc/ipc-read-write-test.cc | 4 +-
cpp/src/arrow/ipc/json-internal.cc | 69 +++++++++++---
cpp/src/arrow/ipc/test-common.h | 13 ++-
integration/integration_test.py | 105 ++++++++++++---------
.../arrow/vector/file/json/JsonFileReader.java | 9 +-
.../arrow/vector/file/json/JsonFileWriter.java | 8 +-
.../apache/arrow/vector/util/DecimalUtility.java | 32 ++++++-
7 files changed, 165 insertions(+), 75 deletions(-)
diff --git a/cpp/src/arrow/ipc/ipc-read-write-test.cc b/cpp/src/arrow/ipc/ipc-read-write-test.cc
index 6f2f5cf..40cd3f0 100644
--- a/cpp/src/arrow/ipc/ipc-read-write-test.cc
+++ b/cpp/src/arrow/ipc/ipc-read-write-test.cc
@@ -727,7 +727,7 @@ TEST_F(TestTensorRoundTrip, BasicRoundtrip) {
int64_t size = 24;
std::vector<int64_t> values;
- test::randint<int64_t>(size, 0, 100, &values);
+ test::randint(size, 0, 100, &values);
auto data = test::GetBufferFromVector(values);
@@ -748,7 +748,7 @@ TEST_F(TestTensorRoundTrip, NonContiguous) {
ASSERT_OK(io::MemoryMapFixture::InitMemoryMap(kBufferSize, path, &mmap_));
std::vector<int64_t> values;
- test::randint<int64_t>(24, 0, 100, &values);
+ test::randint(24, 0, 100, &values);
auto data = test::GetBufferFromVector(values);
Tensor tensor(int64(), data, {4, 3}, {48, 16});
diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc
index 025f6c2..c1c0661 100644
--- a/cpp/src/arrow/ipc/json-internal.cc
+++ b/cpp/src/arrow/ipc/json-internal.cc
@@ -33,6 +33,7 @@
#include "arrow/type.h"
#include "arrow/type_traits.h"
#include "arrow/util/bit-util.h"
+#include "arrow/util/decimal.h"
#include "arrow/util/logging.h"
#include "arrow/util/string.h"
#include "arrow/visitor_inline.h"
@@ -448,7 +449,8 @@ class ArrayWriter {
}
void WriteDataValues(const FixedSizeBinaryArray& arr) {
- int32_t width = arr.byte_width();
+ const int32_t width = arr.byte_width();
+
for (int64_t i = 0; i < arr.length(); ++i) {
const uint8_t* buf = arr.GetValue(i);
std::string encoded = HexEncode(buf, width);
@@ -456,6 +458,13 @@ class ArrayWriter {
}
}
+ void WriteDataValues(const DecimalArray& arr) {
+ for (int64_t i = 0; i < arr.length(); ++i) {
+ const Decimal128 value(arr.GetValue(i));
+ writer_->String(value.ToIntegerString());
+ }
+ }
+
void WriteDataValues(const BooleanArray& arr) {
for (int i = 0; i < arr.length(); ++i) {
writer_->Bool(arr.Value(i));
@@ -1053,7 +1062,9 @@ class ArrayReader {
}
template <typename T>
- typename std::enable_if<std::is_base_of<FixedSizeBinaryType, T>::value, Status>::type
+ typename std::enable_if<std::is_base_of<FixedSizeBinaryType, T>::value &&
+ !std::is_base_of<DecimalType, T>::value,
+ Status>::type
Visit(const T& type) {
typename TypeTraits<T>::BuilderType builder(type_, pool_);
@@ -1073,22 +1084,52 @@ class ArrayReader {
for (int i = 0; i < length_; ++i) {
if (!is_valid_[i]) {
RETURN_NOT_OK(builder.AppendNull());
- continue;
- }
+ } else {
+ const rj::Value& val = json_data_arr[i];
+ DCHECK(val.IsString())
+ << "Found non-string JSON value when parsing FixedSizeBinary value";
+ std::string hex_string = val.GetString();
+ if (static_cast<int32_t>(hex_string.size()) != byte_width * 2) {
+ DCHECK(false) << "Expected size: " << byte_width * 2
+ << " got: " << hex_string.size();
+ }
+ const char* hex_data = hex_string.c_str();
- const rj::Value& val = json_data_arr[i];
- DCHECK(val.IsString());
- std::string hex_string = val.GetString();
- if (static_cast<int32_t>(hex_string.size()) != byte_width * 2) {
- DCHECK(false) << "Expected size: " << byte_width * 2
- << " got: " << hex_string.size();
+ for (int32_t j = 0; j < byte_width; ++j) {
+ RETURN_NOT_OK(ParseHexValue(hex_data + j * 2, &byte_buffer_data[j]));
+ }
+ RETURN_NOT_OK(builder.Append(byte_buffer_data));
}
- const char* hex_data = hex_string.c_str();
+ }
+ return builder.Finish(&result_);
+ }
+
+ template <typename T>
+ typename std::enable_if<std::is_base_of<DecimalType, T>::value, Status>::type Visit(
+ const T& type) {
+ typename TypeTraits<T>::BuilderType builder(type_, pool_);
+
+ const auto& json_data = obj_->FindMember("DATA");
+ RETURN_NOT_ARRAY("DATA", json_data, *obj_);
- for (int32_t j = 0; j < byte_width; ++j) {
- RETURN_NOT_OK(ParseHexValue(hex_data + j * 2, &byte_buffer_data[j]));
+ const auto& json_data_arr = json_data->value.GetArray();
+
+ DCHECK_EQ(static_cast<int32_t>(json_data_arr.Size()), length_);
+
+ for (int i = 0; i < length_; ++i) {
+ if (!is_valid_[i]) {
+ RETURN_NOT_OK(builder.AppendNull());
+ } else {
+ const rj::Value& val = json_data_arr[i];
+ DCHECK(val.IsString())
+ << "Found non-string JSON value when parsing Decimal128 value";
+ DCHECK_GT(val.GetStringLength(), 0)
+ << "Empty string found when parsing Decimal128 value";
+
+ Decimal128 value;
+ RETURN_NOT_OK(Decimal128::FromString(val.GetString(), &value));
+ RETURN_NOT_OK(builder.Append(value));
}
- RETURN_NOT_OK(builder.Append(byte_buffer_data));
}
return builder.Finish(&result_);
}
diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h
index b2137b7..91023db 100644
--- a/cpp/src/arrow/ipc/test-common.h
+++ b/cpp/src/arrow/ipc/test-common.h
@@ -671,8 +671,11 @@ Status MakeFWBinary(std::shared_ptr<RecordBatch>* out) {
}
Status MakeDecimal(std::shared_ptr<RecordBatch>* out) {
- auto f0 = field("f0", decimal(19, 4));
- auto schema = ::arrow::schema({f0, f0});
+ constexpr int kDecimalPrecision = 38;
+ auto type = decimal(kDecimalPrecision, 4);
+ auto f0 = field("f0", type);
+ auto f1 = field("f1", type);
+ auto schema = ::arrow::schema({f0, f1});
constexpr int kDecimalSize = 16;
constexpr int length = 10;
@@ -682,7 +685,7 @@ Status MakeDecimal(std::shared_ptr<RecordBatch>* out) {
RETURN_NOT_OK(AllocateBuffer(default_memory_pool(), kDecimalSize * length, &data));
- test::random_bytes(kDecimalSize * length, 0, data->mutable_data());
+ test::random_decimals(length, 1, kDecimalPrecision, data->mutable_data());
test::random_null_bytes(length, 0.1, is_valid_bytes.data());
RETURN_NOT_OK(BitUtil::BytesToBits(is_valid_bytes, default_memory_pool(), &is_valid));
@@ -690,10 +693,10 @@ Status MakeDecimal(std::shared_ptr<RecordBatch>* out) {
auto a1 = std::make_shared<DecimalArray>(f0->type(), length, data, is_valid,
kUnknownNullCount);
- auto a2 = std::make_shared<DecimalArray>(f0->type(), length, data);
+ auto a2 = std::make_shared<DecimalArray>(f1->type(), length, data);
ArrayVector arrays = {a1, a2};
- *out = std::make_shared<RecordBatch>(schema, a1->length(), arrays);
+ *out = std::make_shared<RecordBatch>(schema, length, arrays);
return Status::OK();
}
diff --git a/integration/integration_test.py b/integration/integration_test.py
index 59a1de5..205176e 100644
--- a/integration/integration_test.py
+++ b/integration/integration_test.py
@@ -65,24 +65,16 @@ def rands(nchars):
return ''.join(np.random.choice(RANDS_CHARS, nchars))
-if six.PY2:
- def frombytes(o):
- return o
+def tobytes(o):
+ if isinstance(o, six.text_type):
+ return o.encode('utf8')
+ return o
- def tobytes(o):
- if isinstance(o, unicode):
- return o.encode('utf8')
- else:
- return o
-else:
- def tobytes(o):
- if isinstance(o, str):
- return o.encode('utf8')
- else:
- return o
- def frombytes(o):
+def frombytes(o):
+ if isinstance(o, six.binary_type):
return o.decode('utf8')
+ return o
# from the merge_arrow_pr.py script
@@ -177,7 +169,7 @@ class PrimitiveType(DataType):
class PrimitiveColumn(Column):
def __init__(self, name, count, is_valid, values):
- Column.__init__(self, name, count)
+ super(PrimitiveColumn, self).__init__(name, count)
self.is_valid = is_valid
self.values = values
@@ -191,15 +183,16 @@ class PrimitiveColumn(Column):
]
-TEST_INT_MIN = - 2**31 + 1
-TEST_INT_MAX = 2**31 - 1
+TEST_INT_MAX = 2 ** 31 - 1
+TEST_INT_MIN = ~TEST_INT_MAX
+
class IntegerType(PrimitiveType):
def __init__(self, name, is_signed, bit_width, nullable=True,
min_value=TEST_INT_MIN,
max_value=TEST_INT_MAX):
- PrimitiveType.__init__(self, name, nullable=nullable)
+ super(IntegerType, self).__init__(name, nullable=nullable)
self.is_signed = is_signed
self.bit_width = bit_width
self.min_value = min_value
@@ -239,9 +232,11 @@ class DateType(IntegerType):
MILLISECOND = 1
def __init__(self, name, unit, nullable=True):
- self.unit = unit
bit_width = 32 if unit == self.DAY else 64
- IntegerType.__init__(self, name, True, bit_width, nullable=nullable)
+ super(DateType, self).__init__(
+ name, True, bit_width, nullable=nullable
+ )
+ self.unit = unit
def _get_type(self):
return OrderedDict([
@@ -268,9 +263,10 @@ class TimeType(IntegerType):
}
def __init__(self, name, unit='s', nullable=True):
+ super(TimeType, self).__init__(
+ name, True, self.BIT_WIDTHS[unit], nullable=nullable
+ )
self.unit = unit
- IntegerType.__init__(self, name, True, self.BIT_WIDTHS[unit],
- nullable=nullable)
def _get_type(self):
return OrderedDict([
@@ -283,9 +279,9 @@ class TimeType(IntegerType):
class TimestampType(IntegerType):
def __init__(self, name, unit='s', tz=None, nullable=True):
+ super(TimestampType, self).__init__(name, True, 64, nullable=nullable)
self.unit = unit
self.tz = tz
- IntegerType.__init__(self, name, True, 64, nullable=nullable)
def _get_type(self):
fields = [
@@ -302,7 +298,7 @@ class TimestampType(IntegerType):
class FloatingPointType(PrimitiveType):
def __init__(self, name, bit_width, nullable=True):
- PrimitiveType.__init__(self, name, nullable=nullable)
+ super(FloatingPointType, self).__init__(name, nullable=nullable)
self.bit_width = bit_width
self.precision = {
@@ -331,13 +327,30 @@ class FloatingPointType(PrimitiveType):
return PrimitiveColumn(name, size, is_valid, values)
-class DecimalType(PrimitiveType):
- def __init__(self, name, bit_width, precision, scale, nullable=True):
- PrimitiveType.__init__(self, name, nullable=True)
+DECIMAL_PRECISION_TO_VALUE = {
+ key: (1 << (8 * i - 1)) - 1 for i, key in enumerate(
+ [1, 3, 5, 7, 10, 12, 15, 17, 19, 22, 24, 27, 29, 32, 34, 36],
+ start=1,
+ )
+}
- self.bit_width = bit_width
+
+def decimal_range_from_precision(precision):
+ assert 1 <= precision <= 38
+ try:
+ max_value = DECIMAL_PRECISION_TO_VALUE[precision]
+ except KeyError:
+ return decimal_range_from_precision(precision - 1)
+ else:
+ return ~max_value, max_value
+
+
+class DecimalType(PrimitiveType):
+ def __init__(self, name, precision, scale, bit_width=128, nullable=True):
+ super(DecimalType, self).__init__(name, nullable=True)
self.precision = precision
self.scale = scale
+ self.bit_width = bit_width
@property
def numpy_type(self):
@@ -359,7 +372,8 @@ class DecimalType(PrimitiveType):
('typeBitWidth', self.bit_width)])])])
def generate_column(self, size, name=None):
- values = [random.randint(0, 2**self.bit_width - 1) for x in range(size)]
+ min_value, max_value = decimal_range_from_precision(self.precision)
+ values = [random.randint(min_value, max_value) for _ in range(size)]
is_valid = self._make_is_valid(size)
if name is None:
@@ -369,14 +383,12 @@ class DecimalType(PrimitiveType):
class DecimalColumn(PrimitiveColumn):
- def __init__(self, name, count, is_valid, values, bit_width):
- PrimitiveColumn.__init__(self, name, count, is_valid, values)
+ def __init__(self, name, count, is_valid, values, bit_width=128):
+ super(DecimalColumn, self).__init__(name, count, is_valid, values)
self.bit_width = bit_width
- self.hex_width = bit_width / 4
def _encode_value(self, x):
- hex_format_str = '%%0%dx' % self.hex_width
- return (hex_format_str % x).upper()
+ return str(x)
class BooleanType(PrimitiveType):
@@ -510,7 +522,7 @@ class StringColumn(BinaryColumn):
class ListType(DataType):
def __init__(self, name, value_type, nullable=True):
- DataType.__init__(self, name, nullable=nullable)
+ super(ListType, self).__init__(name, nullable=nullable)
self.value_type = value_type
def _get_type(self):
@@ -553,7 +565,7 @@ class ListType(DataType):
class ListColumn(Column):
def __init__(self, name, count, is_valid, offsets, values):
- Column.__init__(self, name, count)
+ super(ListColumn, self).__init__(name, count)
self.is_valid = is_valid
self.offsets = offsets
self.values = values
@@ -571,7 +583,7 @@ class ListColumn(Column):
class StructType(DataType):
def __init__(self, name, field_types, nullable=True):
- DataType.__init__(self, name, nullable=nullable)
+ super(StructType, self).__init__(name, nullable=nullable)
self.field_types = field_types
def _get_type(self):
@@ -620,7 +632,7 @@ class Dictionary(object):
class DictionaryType(DataType):
def __init__(self, name, index_type, dictionary, nullable=True):
- DataType.__init__(self, name, nullable=nullable)
+ super(DictionaryType, self).__init__(name, nullable=nullable)
assert isinstance(index_type, IntegerType)
assert isinstance(dictionary, Dictionary)
@@ -655,7 +667,7 @@ class DictionaryType(DataType):
class StructColumn(Column):
def __init__(self, name, count, is_valid, field_values):
- Column.__init__(self, name, count)
+ super(StructColumn, self).__init__(name, count)
self.is_valid = is_valid
self.field_values = field_values
@@ -758,11 +770,12 @@ def generate_primitive_case(batch_sizes):
def generate_decimal_case():
fields = [
- DecimalType('f1', 128, 24, 10, True),
- DecimalType('f2', 128, 32, -10, True)
+ DecimalType(name='f{}'.format(i), precision=precision, scale=2)
+ for i, precision in enumerate(range(3, 39))
]
- batch_sizes = [7, 10]
+ possible_batch_sizes = 7, 10
+ batch_sizes = [possible_batch_sizes[i % 2] for i in range(len(fields))]
return _generate_file('decimal', fields, batch_sizes)
@@ -867,8 +880,9 @@ class IntegrationRunner(object):
def _compare_implementations(self, producer, consumer):
print('##########################################################')
- print('{0} producing, {1} consuming'.format(producer.name,
- consumer.name))
+ print(
+ '{0} producing, {1} consuming'.format(producer.name, consumer.name)
+ )
print('##########################################################')
for json_path in self.json_files:
@@ -1033,6 +1047,7 @@ def run_all_tests(debug=False):
runner.run()
print('-- All tests passed!')
+
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Arrow integration test CLI')
parser.add_argument('--debug', dest='debug', action='store_true',
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java
index c6ebd61..e1c7c90 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java
@@ -27,6 +27,8 @@ import static org.apache.arrow.vector.schema.ArrowVectorType.OFFSET;
import java.io.File;
import java.io.IOException;
+import java.math.BigDecimal;
+import java.math.BigInteger;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
@@ -332,9 +334,10 @@ public class JsonFileReader implements AutoCloseable, DictionaryProvider {
((Float8Vector) valueVector).getMutator().set(i, parser.readValueAs(Double.class));
break;
case DECIMAL: {
- DecimalVector decimalVector = ((DecimalVector) valueVector);
- byte[] value = decodeHexSafe(parser.readValueAs(String.class));
- DecimalUtility.writeByteArrayToArrowBuf(value, decimalVector.getBuffer(), i);
+ DecimalVector decimalVector = (DecimalVector) valueVector;
+ // Here we assume the decimal value is the unscaled integer value as a string
+ BigDecimal decimalValue = new BigDecimal(parser.readValueAs(String.class));
+ DecimalUtility.writeBigDecimalToArrowBuf(decimalValue, decimalVector.getBuffer(), i);
}
break;
case VARBINARY:
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java
index 04e4437..05341be 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java
@@ -20,6 +20,7 @@ package org.apache.arrow.vector.file.json;
import java.io.File;
import java.io.IOException;
+import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
@@ -48,6 +49,7 @@ import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.dictionary.Dictionary;
import org.apache.arrow.vector.dictionary.DictionaryProvider;
import org.apache.arrow.vector.schema.ArrowVectorType;
+import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.Schema;
@@ -242,9 +244,9 @@ public class JsonFileWriter implements AutoCloseable {
}
break;
case DECIMAL: {
- ArrowBuf bytebuf = valueVector.getDataBuffer();
- String hexString = Hex.encodeHexString(DecimalUtility.getByteArrayFromArrowBuf(bytebuf, i));
- generator.writeString(hexString);
+ BigDecimal decimalValue = ((DecimalVector) valueVector).getAccessor().getObject(i);
+ // We write the unscaled value, because the scale is stored in the type metadata.
+ generator.writeString(decimalValue.unscaledValue().toString());
}
break;
default:
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java b/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java
index 033ae6c..acf7c58 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java
@@ -142,8 +142,18 @@ public class DecimalUtility {
*/
public static BigDecimal getBigDecimalFromArrowBuf(ArrowBuf bytebuf, int index, int scale) {
byte[] value = new byte[DECIMAL_BYTE_LENGTH];
+ byte temp;
final int startIndex = index * DECIMAL_BYTE_LENGTH;
+
+ // Decimal stored as little endian, need to swap bytes to make BigDecimal
bytebuf.getBytes(startIndex, value, 0, DECIMAL_BYTE_LENGTH);
+ int stop = DECIMAL_BYTE_LENGTH / 2;
+ for (int i = 0, j; i < stop; i++) {
+ temp = value[i];
+ j = (DECIMAL_BYTE_LENGTH - 1) - i;
+ value[i] = value[j];
+ value[j] = temp;
+ }
BigInteger unscaledValue = new BigInteger(value);
return new BigDecimal(unscaledValue, scale);
}
@@ -212,10 +222,26 @@ public class DecimalUtility {
if (bytes.length > DECIMAL_BYTE_LENGTH) {
throw new UnsupportedOperationException("Decimal size greater than 16 bytes");
}
- final int padLength = DECIMAL_BYTE_LENGTH - bytes.length;
- for (int i = 0; i < padLength; i++) {
+
+ // Decimal stored as little endian, need to swap data bytes before writing to ArrowBuf
+ byte[] bytesLE = new byte[bytes.length];
+ int stop = bytes.length / 2;
+ for (int i = 0, j; i < stop; i++) {
+ j = (bytes.length - 1) - i;
+ bytesLE[i] = bytes[j];
+ bytesLE[j] = bytes[i];
+ }
+ if (bytes.length % 2 != 0) {
+ int i = (bytes.length / 2);
+ bytesLE[i] = bytes[i];
+ }
+
+ // Write LE data
+ bytebuf.setBytes(startIndex, bytesLE, 0, bytes.length);
+
+ // Write padding after data
+ for (int i = bytes.length; i < DECIMAL_BYTE_LENGTH; i++) {
bytebuf.setByte(startIndex + i, padValue);
}
- bytebuf.setBytes(startIndex + padLength, bytes, 0, bytes.length);
}
}
--
To stop receiving notification emails like this one, please contact
['"commits@arrow.apache.org" <co...@arrow.apache.org>'].