You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/12/20 22:20:28 UTC

[arrow] branch master updated: ARROW-3982: [C++] Allow "binary" input in simple JSON format

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 1a86ab5  ARROW-3982: [C++] Allow "binary" input in simple JSON format
1a86ab5 is described below

commit 1a86ab51d8ee86e132645c9671f5355774b8f71b
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Thu Dec 20 16:19:42 2018 -0600

    ARROW-3982: [C++] Allow "binary" input in simple JSON format
    
    Since rapidjson doesn't validate UTF8 by default, we can represent arbitrary binary bytes in the JSON input (bytes < 0x20 have to be represented as unicode escapes).
    
    Author: Antoine Pitrou <an...@python.org>
    
    Closes #3222 from pitrou/ARROW-3982-json-simple-binary and squashes the following commits:
    
    5aaa5edc8 <Antoine Pitrou> ARROW-3982:  Allow "binary" input in simple JSON format
---
 cpp/src/arrow/ipc/json-simple-test.cc | 40 +++++++++++++++++++++++++++++++
 cpp/src/arrow/ipc/json-simple.cc      | 45 ++++++++++++++++++++++++++++++++---
 cpp/src/arrow/pretty_print-test.cc    | 11 ++-------
 3 files changed, 84 insertions(+), 12 deletions(-)

diff --git a/cpp/src/arrow/ipc/json-simple-test.cc b/cpp/src/arrow/ipc/json-simple-test.cc
index 84a2210..2e80a0c 100644
--- a/cpp/src/arrow/ipc/json-simple-test.cc
+++ b/cpp/src/arrow/ipc/json-simple-test.cc
@@ -289,6 +289,7 @@ TEST(TestDouble, Errors) {
 }
 
 TEST(TestString, Basics) {
+  // String type
   std::shared_ptr<DataType> type = utf8();
   std::shared_ptr<Array> expected, actual;
 
@@ -300,6 +301,20 @@ TEST(TestString, Basics) {
   s += '\x00';
   s += "char";
   AssertJSONArray<StringType, std::string>(type, "[\"\", \"some\\u0000char\"]", {"", s});
+  // UTF8 sequence in string
+  AssertJSONArray<StringType, std::string>(type, "[\"\xc3\xa9\"]", {"\xc3\xa9"});
+
+  // Binary type
+  type = binary();
+  AssertJSONArray<BinaryType, std::string>(type, "[\"\", \"foo\", null]",
+                                           {true, true, false}, {"", "foo", ""});
+  // Arbitrary binary (non-UTF8) sequence in string
+  s = "\xff\x9f";
+  AssertJSONArray<BinaryType, std::string>(type, "[\"" + s + "\"]", {s});
+  // Bytes < 0x20 can be represented as JSON unicode escapes
+  s = '\x00';
+  s += "\x1f";
+  AssertJSONArray<BinaryType, std::string>(type, "[\"\\u0000\\u001f\"]", {s});
 }
 
 TEST(TestString, Errors) {
@@ -310,6 +325,31 @@ TEST(TestString, Errors) {
   ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[]]", &array));
 }
 
+TEST(TestFixedSizeBinary, Basics) {
+  std::shared_ptr<DataType> type = fixed_size_binary(3);
+  std::shared_ptr<Array> expected, actual;
+
+  AssertJSONArray<FixedSizeBinaryType, std::string>(type, "[]", {});
+  AssertJSONArray<FixedSizeBinaryType, std::string>(type, "[\"foo\", \"bar\"]",
+                                                    {"foo", "bar"});
+  AssertJSONArray<FixedSizeBinaryType, std::string>(type, "[null, \"foo\"]",
+                                                    {false, true}, {"", "foo"});
+  // Arbitrary binary (non-UTF8) sequence in string
+  std::string s = "\xff\x9f\xcc";
+  AssertJSONArray<FixedSizeBinaryType, std::string>(type, "[\"" + s + "\"]", {s});
+}
+
+TEST(TestFixedSizeBinary, Errors) {
+  std::shared_ptr<DataType> type = fixed_size_binary(3);
+  std::shared_ptr<Array> array;
+
+  ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[0]", &array));
+  ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[[]]", &array));
+  // Invalid length
+  ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"\"]", &array));
+  ASSERT_RAISES(Invalid, ArrayFromJSON(type, "[\"abcd\"]", &array));
+}
+
 TEST(TestDecimal, Basics) {
   std::shared_ptr<DataType> type = decimal(10, 4);
   std::shared_ptr<Array> expected, actual;
diff --git a/cpp/src/arrow/ipc/json-simple.cc b/cpp/src/arrow/ipc/json-simple.cc
index d812f84..7a78fe4 100644
--- a/cpp/src/arrow/ipc/json-simple.cc
+++ b/cpp/src/arrow/ipc/json-simple.cc
@@ -41,7 +41,8 @@ using ::arrow::internal::checked_cast;
 static constexpr auto kParseFlags = rj::kParseFullPrecisionFlag | rj::kParseNanAndInfFlag;
 
 static Status JSONTypeError(const char* expected_type, rj::Type json_type) {
-  return Status::Invalid("Expected ", expected_type, " or null, got type ", json_type);
+  return Status::Invalid("Expected ", expected_type, " or null, got JSON type ",
+                         json_type);
 }
 
 class Converter {
@@ -91,7 +92,6 @@ class ConcreteConverter : public Converter {
 };
 
 // TODO : dates and times?
-// TODO : binary / fixed size binary?
 
 // ------------------------------------------------------------------------
 // Converter for null arrays
@@ -284,7 +284,7 @@ class DecimalConverter final : public ConcreteConverter<DecimalConverter> {
 };
 
 // ------------------------------------------------------------------------
-// Converter for string arrays
+// Converter for binary and string arrays
 
 class StringConverter final : public ConcreteConverter<StringConverter> {
  public:
@@ -314,6 +314,43 @@ class StringConverter final : public ConcreteConverter<StringConverter> {
 };
 
 // ------------------------------------------------------------------------
+// Converter for fixed-size binary arrays
+
+class FixedSizeBinaryConverter final
+    : public ConcreteConverter<FixedSizeBinaryConverter> {
+ public:
+  explicit FixedSizeBinaryConverter(const std::shared_ptr<DataType>& type) {
+    this->type_ = type;
+    builder_ = std::make_shared<FixedSizeBinaryBuilder>(type, default_memory_pool());
+  }
+
+  Status AppendNull() override { return builder_->AppendNull(); }
+
+  Status AppendValue(const rj::Value& json_obj) override {
+    if (json_obj.IsNull()) {
+      return AppendNull();
+    }
+    if (json_obj.IsString()) {
+      auto view = util::string_view(json_obj.GetString(), json_obj.GetStringLength());
+      if (view.length() != static_cast<size_t>(builder_->byte_width())) {
+        std::stringstream ss;
+        ss << "Invalid string length " << view.length() << " in JSON input for "
+           << this->type_->ToString();
+        return Status::Invalid(ss.str());
+      }
+      return builder_->Append(view);
+    } else {
+      return JSONTypeError("string", json_obj.GetType());
+    }
+  }
+
+  std::shared_ptr<ArrayBuilder> builder() override { return builder_; }
+
+ protected:
+  std::shared_ptr<FixedSizeBinaryBuilder> builder_;
+};
+
+// ------------------------------------------------------------------------
 // Converter for list arrays
 
 class ListConverter final : public ConcreteConverter<ListConverter> {
@@ -449,6 +486,8 @@ Status GetConverter(const std::shared_ptr<DataType>& type,
     SIMPLE_CONVERTER_CASE(Type::LIST, ListConverter)
     SIMPLE_CONVERTER_CASE(Type::STRUCT, StructConverter)
     SIMPLE_CONVERTER_CASE(Type::STRING, StringConverter)
+    SIMPLE_CONVERTER_CASE(Type::BINARY, StringConverter)
+    SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter)
     SIMPLE_CONVERTER_CASE(Type::DECIMAL, DecimalConverter)
     default: {
       return Status::NotImplemented("JSON conversion to ", type->ToString(),
diff --git a/cpp/src/arrow/pretty_print-test.cc b/cpp/src/arrow/pretty_print-test.cc
index a1acfb8..8696efc 100644
--- a/cpp/src/arrow/pretty_print-test.cc
+++ b/cpp/src/arrow/pretty_print-test.cc
@@ -277,18 +277,11 @@ TEST_F(TestPrettyPrint, ListType) {
 
 TEST_F(TestPrettyPrint, FixedSizeBinaryType) {
   std::vector<bool> is_valid = {true, true, false, true, false};
-  std::vector<std::string> values = {"foo", "bar", "baz"};
 
-  std::shared_ptr<Array> array;
   auto type = fixed_size_binary(3);
-  FixedSizeBinaryBuilder builder(type);
-
-  ASSERT_OK(builder.Append(values[0]));
-  ASSERT_OK(builder.Append(values[1]));
-  ASSERT_OK(builder.Append(values[2]));
-  ASSERT_OK(builder.Finish(&array));
+  auto array = ArrayFromJSON(type, "[\"foo\", \"bar\", null, \"baz\"]");
 
-  static const char* ex = "[\n  666F6F,\n  626172,\n  62617A\n]";
+  static const char* ex = "[\n  666F6F,\n  626172,\n  null,\n  62617A\n]";
   CheckArray(*array, {0, 10}, ex);
   static const char* ex_2 = "  [\n    666F6F,\n    ...\n    62617A\n  ]";
   CheckArray(*array, {2, 1}, ex_2);