You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/12/20 22:20:28 UTC
[arrow] branch master updated: ARROW-3982: [C++] Allow "binary"
input in simple JSON format
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 1a86ab5 ARROW-3982: [C++] Allow "binary" input in simple JSON format
1a86ab5 is described below
commit 1a86ab51d8ee86e132645c9671f5355774b8f71b
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Thu Dec 20 16:19:42 2018 -0600
ARROW-3982: [C++] Allow "binary" input in simple JSON format
Since rapidjson doesn't validate UTF8 by default, we can represent arbitrary binary bytes in the JSON input (bytes < 0x20 have to be represented as unicode escapes).
Author: Antoine Pitrou <an...@python.org>
Closes #3222 from pitrou/ARROW-3982-json-simple-binary and squashes the following commits:
5aaa5edc8 <Antoine Pitrou> ARROW-3982: Allow "binary" input in simple JSON format
---
cpp/src/arrow/ipc/json-simple-test.cc | 40 +++++++++++++++++++++++++++++++
cpp/src/arrow/ipc/json-simple.cc | 45 ++++++++++++++++++++++++++++++++---
cpp/src/arrow/pretty_print-test.cc | 11 ++-------
3 files changed, 84 insertions(+), 12 deletions(-)
diff --git a/cpp/src/arrow/ipc/json-simple-test.cc b/cpp/src/arrow/ipc/json-simple-test.cc
index 84a2210..2e80a0c 100644
--- a/cpp/src/arrow/ipc/json-simple-test.cc
+++ b/cpp/src/arrow/ipc/json-simple-test.cc
@@ -289,6 +289,7 @@ TEST(TestDouble, Errors) {
}
TEST(TestString, Basics) {
+ // String type
std::shared_ptr<DataType> type = utf8();
std::shared_ptr<Array> expected, actual;
@@ -300,6 +301,20 @@ TEST(TestString, Basics) {
s += '\x00';
s += "char";
AssertJSONArray<StringType, std::string>(type, "[\"\", \"some\\u0000char\"]", {"", s});
+ // UTF8 sequence in string
+ AssertJSONArray<StringType, std::string>(type, "[\"\xc3\xa9\"]", {"\xc3\xa9"});
+
+ // Binary type
+ type = binary();
+ AssertJSONArray<BinaryType, std::string>(type, "[\"\", \"foo\", null]",
+ {true, true, false}, {"", "foo", ""});
+ // Arbitrary binary (non-UTF8) sequence in string
+ s = "\xff\x9f";
+ AssertJSONArray<BinaryType, std::string>(type, "[\"" + s + "\"]", {s});
+ // Bytes < 0x20 can be represented as JSON unicode escapes
+ s = '\x00';
+ s += "\x1f";
+ AssertJSONArray<BinaryType, std::string>(type, "[\"\\u0000\\u001f\"]", {s});
}
TEST(TestString, Errors) {
@@ -310,6 +325,31 @@ TEST(TestString, Errors) {
ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[]]", &array));
}
+TEST(TestFixedSizeBinary, Basics) {
+ std::shared_ptr<DataType> type = fixed_size_binary(3);
+ std::shared_ptr<Array> expected, actual;
+
+ AssertJSONArray<FixedSizeBinaryType, std::string>(type, "[]", {});
+ AssertJSONArray<FixedSizeBinaryType, std::string>(type, "[\"foo\", \"bar\"]",
+ {"foo", "bar"});
+ AssertJSONArray<FixedSizeBinaryType, std::string>(type, "[null, \"foo\"]",
+ {false, true}, {"", "foo"});
+ // Arbitrary binary (non-UTF8) sequence in string
+ std::string s = "\xff\x9f\xcc";
+ AssertJSONArray<FixedSizeBinaryType, std::string>(type, "[\"" + s + "\"]", {s});
+}
+
+TEST(TestFixedSizeBinary, Errors) {
+ std::shared_ptr<DataType> type = fixed_size_binary(3);
+ std::shared_ptr<Array> array;
+
+ ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0]", &array));
+ ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[]]", &array));
+ // Invalid length
+ ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"\"]", &array));
+ ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"abcd\"]", &array));
+}
+
TEST(TestDecimal, Basics) {
std::shared_ptr<DataType> type = decimal(10, 4);
std::shared_ptr<Array> expected, actual;
diff --git a/cpp/src/arrow/ipc/json-simple.cc b/cpp/src/arrow/ipc/json-simple.cc
index d812f84..7a78fe4 100644
--- a/cpp/src/arrow/ipc/json-simple.cc
+++ b/cpp/src/arrow/ipc/json-simple.cc
@@ -41,7 +41,8 @@ using ::arrow::internal::checked_cast;
static constexpr auto kParseFlags = rj::kParseFullPrecisionFlag | rj::kParseNanAndInfFlag;
static Status JSONTypeError(const char* expected_type, rj::Type json_type) {
- return Status::Invalid("Expected ", expected_type, " or null, got type ", json_type);
+ return Status::Invalid("Expected ", expected_type, " or null, got JSON type ",
+ json_type);
}
class Converter {
@@ -91,7 +92,6 @@ class ConcreteConverter : public Converter {
};
// TODO : dates and times?
-// TODO : binary / fixed size binary?
// ------------------------------------------------------------------------
// Converter for null arrays
@@ -284,7 +284,7 @@ class DecimalConverter final : public ConcreteConverter<DecimalConverter> {
};
// ------------------------------------------------------------------------
-// Converter for string arrays
+// Converter for binary and string arrays
class StringConverter final : public ConcreteConverter<StringConverter> {
public:
@@ -314,6 +314,43 @@ class StringConverter final : public ConcreteConverter<StringConverter> {
};
// ------------------------------------------------------------------------
+// Converter for fixed-size binary arrays
+
+class FixedSizeBinaryConverter final
+ : public ConcreteConverter<FixedSizeBinaryConverter> {
+ public:
+ explicit FixedSizeBinaryConverter(const std::shared_ptr<DataType>& type) {
+ this->type_ = type;
+ builder_ = std::make_shared<FixedSizeBinaryBuilder>(type, default_memory_pool());
+ }
+
+ Status AppendNull() override { return builder_->AppendNull(); }
+
+ Status AppendValue(const rj::Value& json_obj) override {
+ if (json_obj.IsNull()) {
+ return AppendNull();
+ }
+ if (json_obj.IsString()) {
+ auto view = util::string_view(json_obj.GetString(), json_obj.GetStringLength());
+ if (view.length() != static_cast<size_t>(builder_->byte_width())) {
+ std::stringstream ss;
+ ss << "Invalid string length " << view.length() << " in JSON input for "
+ << this->type_->ToString();
+ return Status::Invalid(ss.str());
+ }
+ return builder_->Append(view);
+ } else {
+ return JSONTypeError("string", json_obj.GetType());
+ }
+ }
+
+ std::shared_ptr<ArrayBuilder> builder() override { return builder_; }
+
+ protected:
+ std::shared_ptr<FixedSizeBinaryBuilder> builder_;
+};
+
+// ------------------------------------------------------------------------
// Converter for list arrays
class ListConverter final : public ConcreteConverter<ListConverter> {
@@ -449,6 +486,8 @@ Status GetConverter(const std::shared_ptr<DataType>& type,
SIMPLE_CONVERTER_CASE(Type::LIST, ListConverter)
SIMPLE_CONVERTER_CASE(Type::STRUCT, StructConverter)
SIMPLE_CONVERTER_CASE(Type::STRING, StringConverter)
+ SIMPLE_CONVERTER_CASE(Type::BINARY, StringConverter)
+ SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter)
SIMPLE_CONVERTER_CASE(Type::DECIMAL, DecimalConverter)
default: {
return Status::NotImplemented("JSON conversion to ", type->ToString(),
diff --git a/cpp/src/arrow/pretty_print-test.cc b/cpp/src/arrow/pretty_print-test.cc
index a1acfb8..8696efc 100644
--- a/cpp/src/arrow/pretty_print-test.cc
+++ b/cpp/src/arrow/pretty_print-test.cc
@@ -277,18 +277,11 @@ TEST_F(TestPrettyPrint, ListType) {
TEST_F(TestPrettyPrint, FixedSizeBinaryType) {
std::vector<bool> is_valid = {true, true, false, true, false};
- std::vector<std::string> values = {"foo", "bar", "baz"};
- std::shared_ptr<Array> array;
auto type = fixed_size_binary(3);
- FixedSizeBinaryBuilder builder(type);
-
- ASSERT_OK(builder.Append(values[0]));
- ASSERT_OK(builder.Append(values[1]));
- ASSERT_OK(builder.Append(values[2]));
- ASSERT_OK(builder.Finish(&array));
+ auto array = ArrayFromJSON(type, "[\"foo\", \"bar\", null, \"baz\"]");
- static const char* ex = "[\n 666F6F,\n 626172,\n 62617A\n]";
+ static const char* ex = "[\n 666F6F,\n 626172,\n null,\n 62617A\n]";
CheckArray(*array, {0, 10}, ex);
static const char* ex_2 = " [\n 666F6F,\n ...\n 62617A\n ]";
CheckArray(*array, {2, 1}, ex_2);