You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by gi...@apache.org on 2024/01/10 01:29:52 UTC
(arrow-nanoarrow) branch main updated: Update dist/ for commit 6523f511799537ac1dc3b210d8e804caff02edd9
This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-nanoarrow.git
The following commit(s) were added to refs/heads/main by this push:
new a8056cb Update dist/ for commit 6523f511799537ac1dc3b210d8e804caff02edd9
a8056cb is described below
commit a8056cbc5f1fe557a48ccaa362127dad60f9249d
Author: GitHub Actions <ac...@github.com>
AuthorDate: Wed Jan 10 01:29:47 2024 +0000
Update dist/ for commit 6523f511799537ac1dc3b210d8e804caff02edd9
---
dist/nanoarrow.c | 4 +-
dist/nanoarrow.hpp | 10 +-
dist/nanoarrow_ipc.c | 3 +
dist/nanoarrow_testing.hpp | 467 ++++++++++++++++++++++++++++++++++++++++-----
4 files changed, 428 insertions(+), 56 deletions(-)
diff --git a/dist/nanoarrow.c b/dist/nanoarrow.c
index a54e7ab..18307c0 100644
--- a/dist/nanoarrow.c
+++ b/dist/nanoarrow.c
@@ -2922,8 +2922,8 @@ static int ArrowArrayViewValidateFull(struct ArrowArrayView* array_view,
// Dictionary validation not implemented
if (array_view->dictionary != NULL) {
- ArrowErrorSet(error, "Validation for dictionary-encoded arrays is not implemented");
- return ENOTSUP;
+ NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateFull(array_view->dictionary, error));
+ // TODO: validate the indices
}
return NANOARROW_OK;
diff --git a/dist/nanoarrow.hpp b/dist/nanoarrow.hpp
index 15914ce..8d5b841 100644
--- a/dist/nanoarrow.hpp
+++ b/dist/nanoarrow.hpp
@@ -206,15 +206,21 @@ class Unique {
/// \brief Move and take ownership of data wrapped by rhs
Unique(Unique&& rhs) : Unique(rhs.get()) {}
+ Unique& operator=(Unique&& rhs) {  // Move assignment: hands rhs's data pointer to reset()
+ reset(rhs.get());  // NOTE(review): reset(T*) is not shown here; presumably it releases the current data and moves from rhs - confirm
+ return *this;
+ }
// These objects are not copyable
- Unique(Unique& rhs) = delete;
+ Unique(const Unique& rhs) = delete;
/// \brief Get a pointer to the data owned by this object
T* get() noexcept { return &data_; }
+ const T* get() const noexcept { return &data_; }
/// \brief Use the pointer operator to access fields of this object
- T* operator->() { return &data_; }
+ T* operator->() noexcept { return &data_; }
+ const T* operator->() const noexcept { return &data_; }
/// \brief Call data's release callback if valid
void reset() { release_pointer(&data_); }
diff --git a/dist/nanoarrow_ipc.c b/dist/nanoarrow_ipc.c
index 54e7468..0696883 100644
--- a/dist/nanoarrow_ipc.c
+++ b/dist/nanoarrow_ipc.c
@@ -21394,6 +21394,9 @@ ArrowErrorCode ArrowIpcDecoderDecodeSchema(struct ArrowIpcDecoder* decoder,
return result;
}
+ // Top-level batch schema is typically non-nullable
+ tmp.flags = 0;
+
result = ArrowIpcDecoderSetChildren(&tmp, fields, error);
if (result != NANOARROW_OK) {
ArrowSchemaRelease(&tmp);
diff --git a/dist/nanoarrow_testing.hpp b/dist/nanoarrow_testing.hpp
index b62c3ca..951c215 100644
--- a/dist/nanoarrow_testing.hpp
+++ b/dist/nanoarrow_testing.hpp
@@ -15,10 +15,12 @@
// specific language governing permissions and limitations
// under the License.
+#include <algorithm>
#include <iostream>
#include <limits>
#include <sstream>
#include <string>
+#include <unordered_map>
#include <nlohmann/json.hpp>
@@ -35,6 +37,96 @@ namespace nanoarrow {
namespace testing {
+namespace internal {
+
+// Internal representation of the various structures needed to import and/or export
+// a dictionary array. We use a serialized version of the dictionary value because
+// nanoarrow doesn't currently have the ability to copy or reference count an Array.
+struct Dictionary {
+ nanoarrow::UniqueSchema schema;  // deep copy of the dictionary values schema
+ int64_t column_length;  // length recorded together with column_json
+ std::string column_json;  // dictionary values column, serialized as JSON
+};
+
+/// \brief Registry of the dictionaries encountered while reading or writing
+///
+/// Keeps two lookups: dictionary id -> internal::Dictionary, and values-schema
+/// pointer -> dictionary id. Schema pointers are used as map keys only (they are
+/// never dereferenced by the lookups), so a pointer lookup succeeds only for the
+/// exact ArrowSchema* that was previously passed to RecordSchema().
+class DictionaryContext {
+ public:
+  DictionaryContext() : next_id_(0) {}
+
+  /// \brief Register values_schema under a caller-supplied dictionary_id
+  ///
+  /// The first time dictionary_id is seen, a deep copy of values_schema is stored.
+  /// The pointer -> id mapping is (re)written on every successful call. If the
+  /// deep copy fails, its error code is returned and nothing is recorded.
+  ArrowErrorCode RecordSchema(int32_t dictionary_id, const ArrowSchema* values_schema) {
+    if (!HasDictionaryForId(dictionary_id)) {
+      internal::Dictionary& dictionary = dictionaries_[dictionary_id];
+      const ArrowErrorCode result =
+          ArrowSchemaDeepCopy(values_schema, dictionary.schema.get());
+      if (result != NANOARROW_OK) {
+        // Do not leave an entry without a schema behind: it would make
+        // HasDictionaryForId() report a dictionary that cannot be used.
+        dictionaries_.erase(dictionary_id);
+        return result;
+      }
+    }
+
+    dictionary_ids_[values_schema] = dictionary_id;
+    return NANOARROW_OK;
+  }
+
+  /// \brief Register values_schema under the next unused id
+  ///
+  /// On success the assigned id is written to dictionary_id.
+  ArrowErrorCode RecordSchema(const ArrowSchema* values_schema, int32_t* dictionary_id) {
+    // Skip over ids that were already claimed explicitly
+    while (HasDictionaryForId(next_id_)) {
+      next_id_++;
+    }
+
+    NANOARROW_RETURN_NOT_OK(RecordSchema(next_id_, values_schema));
+    *dictionary_id = next_id_++;
+    return NANOARROW_OK;
+  }
+
+  /// \brief Store the serialized values column for dictionary_id
+  ///
+  /// An entry is created if dictionary_id has not been recorded yet.
+  void RecordArray(int32_t dictionary_id, int64_t length, std::string column_json) {
+    internal::Dictionary& dictionary = dictionaries_[dictionary_id];
+    dictionary.column_length = length;
+    dictionary.column_json = std::move(column_json);
+  }
+
+  /// \brief Store the serialized values column for a previously recorded schema
+  ///
+  /// Does nothing if values_schema was never passed to RecordSchema(); callers
+  /// can check HasDictionaryForSchema() first to detect that case.
+  void RecordArray(const ArrowSchema* values_schema, int64_t length,
+                   std::string column_json) {
+    auto ids_it = dictionary_ids_.find(values_schema);
+    if (ids_it == dictionary_ids_.end()) {
+      return;
+    }
+
+    RecordArray(ids_it->second, length, std::move(column_json));
+  }
+
+  /// \brief Check whether no dictionaries have been recorded
+  bool empty() const { return dictionaries_.empty(); }
+
+  /// \brief Remove all recorded dictionaries and restart id assignment at zero
+  void clear() {
+    dictionaries_.clear();
+    dictionary_ids_.clear();
+    next_id_ = 0;
+  }
+
+  bool HasDictionaryForSchema(const ArrowSchema* values_schema) const {
+    return dictionary_ids_.find(values_schema) != dictionary_ids_.end();
+  }
+
+  bool HasDictionaryForId(int32_t dictionary_id) const {
+    return dictionaries_.find(dictionary_id) != dictionaries_.end();
+  }
+
+  /// \brief Look up a dictionary by id
+  ///
+  /// The id must have been recorded: the lookup result is dereferenced without
+  /// a check. Use HasDictionaryForId() first when that is not guaranteed.
+  const Dictionary& Get(int32_t dictionary_id) const {
+    auto dict_it = dictionaries_.find(dictionary_id);
+    return dict_it->second;
+  }
+
+  /// \brief Look up a dictionary by the values schema pointer it was recorded with
+  ///
+  /// The pointer must have been recorded: the lookup result is dereferenced
+  /// without a check. Use HasDictionaryForSchema() first when that is not guaranteed.
+  const Dictionary& Get(const ArrowSchema* values_schema) const {
+    auto ids_it = dictionary_ids_.find(values_schema);
+    return Get(ids_it->second);
+  }
+
+  /// \brief Collect every recorded dictionary id (in unspecified order)
+  std::vector<int32_t> GetAllIds() const {
+    std::vector<int32_t> out;
+    out.reserve(dictionaries_.size());
+    for (const auto& value : dictionaries_) {
+      out.push_back(value.first);
+    }
+    return out;
+  }
+
+ private:
+  int32_t next_id_;
+  std::unordered_map<int32_t, Dictionary> dictionaries_;
+  std::unordered_map<const ArrowSchema*, int32_t> dictionary_ids_;
+};
+
+} // namespace internal
+
/// \defgroup nanoarrow_testing-json Integration test helpers
///
/// See testing format documentation for details of the JSON representation. This
@@ -56,6 +148,8 @@ class TestingJSONWriter {
/// avoid serialization issues.
void set_float_precision(int precision) { float_precision_ = precision; }
+ void ResetDictionaries() { dictionaries_.clear(); }  // Forget every dictionary recorded by this writer
+
/// \brief Write an ArrowArrayStream as a data file JSON object to out
///
/// Creates output like `{"schema": {...}, "batches": [...], ...}`.
@@ -64,6 +158,8 @@ class TestingJSONWriter {
return EINVAL;
}
+ ResetDictionaries();
+
out << R"({"schema": )";
nanoarrow::UniqueSchema schema;
@@ -93,7 +189,14 @@ class TestingJSONWriter {
array.reset();
} while (true);
- out << "]}";
+ out << "]";
+
+ if (!dictionaries_.empty()) {
+ out << R"(, "dictionaries": )";
+ NANOARROW_RETURN_NOT_OK(WriteDictionaryBatches(out));
+ }
+
+ out << "}";
return NANOARROW_OK;
}
@@ -138,7 +241,7 @@ class TestingJSONWriter {
/// Creates output like `{"name" : "col", "type": {...}, ...}`
ArrowErrorCode WriteField(std::ostream& out, const ArrowSchema* field) {
ArrowSchemaView view;
- NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&view, (ArrowSchema*)field, nullptr));
+ NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&view, field, nullptr));
out << "{";
@@ -157,25 +260,37 @@ class TestingJSONWriter {
out << R"(, "nullable": false)";
}
- // Write type
- out << R"(, "type": )";
- NANOARROW_RETURN_NOT_OK(WriteType(out, &view));
+ // For dictionary encoding, write type as the dictionary (values) type,
+ // record the dictionary schema, and write the "dictionary" member
+ if (field->dictionary != nullptr) {
+ ArrowSchemaView dictionary_view;
+ NANOARROW_RETURN_NOT_OK(
+ ArrowSchemaViewInit(&dictionary_view, field->dictionary, nullptr));
- // Write children
- out << R"(, "children": )";
- if (field->n_children == 0) {
- out << "[]";
+ out << R"(, "type": )";
+ NANOARROW_RETURN_NOT_OK(WriteType(out, &dictionary_view));
+
+ int32_t dictionary_id;
+ NANOARROW_RETURN_NOT_OK(
+ dictionaries_.RecordSchema(field->dictionary, &dictionary_id));
+
+ out << R"(, "dictionary": )";
+ view.type = view.storage_type;
+ NANOARROW_RETURN_NOT_OK(WriteFieldDictionary(
+ out, dictionary_id, field->flags & ARROW_FLAG_DICTIONARY_ORDERED, &view));
+
+ // Write dictionary children
+ out << R"(, "children": )";
+ NANOARROW_RETURN_NOT_OK(WriteFieldChildren(out, field->dictionary));
} else {
- out << "[";
- NANOARROW_RETURN_NOT_OK(WriteField(out, field->children[0]));
- for (int64_t i = 1; i < field->n_children; i++) {
- out << ", ";
- NANOARROW_RETURN_NOT_OK(WriteField(out, field->children[i]));
- }
- out << "]";
- }
+ // Write non-dictionary type/children
+ out << R"(, "type": )";
+ NANOARROW_RETURN_NOT_OK(WriteType(out, &view));
- // TODO: Dictionary (currently fails at WriteType)
+ // Write children
+ out << R"(, "children": )";
+ NANOARROW_RETURN_NOT_OK(WriteFieldChildren(out, field));
+ }
// Write metadata
if (field->metadata != nullptr) {
@@ -341,20 +456,70 @@ class TestingJSONWriter {
}
out << "}";
+
+ // Write the dictionary values to the DictionaryContext for later if applicable
+ if (field->dictionary != nullptr) {
+ if (!dictionaries_.HasDictionaryForSchema(field->dictionary)) {
+ return EINVAL;
+ }
+
+ std::stringstream dictionary_output;
+ NANOARROW_RETURN_NOT_OK(
+ WriteColumn(dictionary_output, field->dictionary, value->dictionary));
+ dictionaries_.RecordArray(field->dictionary, value->dictionary->length,
+ std::move(dictionary_output.str()));
+ }
+
+ return NANOARROW_OK;
+ }
+
+ /// \brief Write every recorded dictionary as a JSON array to out
+ ///
+ /// Creates output like `[{"id": 0, "data": {...}}, ...]` ordered by ascending
+ /// dictionary id, or `[]` when no dictionaries have been recorded.
+ ArrowErrorCode WriteDictionaryBatches(std::ostream& out) {
+   std::vector<int32_t> sorted_ids = dictionaries_.GetAllIds();
+   std::sort(sorted_ids.begin(), sorted_ids.end());
+
+   out << "[";
+   for (size_t pos = 0; pos < sorted_ids.size(); pos++) {
+     if (pos > 0) {
+       out << ", ";
+     }
+     NANOARROW_RETURN_NOT_OK(WriteDictionaryBatch(out, sorted_ids[pos]));
+   }
+   out << "]";
+
+   return NANOARROW_OK;
+ }
private:
int float_precision_;
+ internal::DictionaryContext dictionaries_;
- ArrowErrorCode WriteType(std::ostream& out, const ArrowSchemaView* field) {
- ArrowType type;
- if (field->extension_name.data != nullptr) {
- type = field->storage_type;
+ /// \brief Write one recorded dictionary as a JSON object to out
+ ///
+ /// Creates output like `{"id": 0, "data": {"count": 3, "columns": [...]}}` using
+ /// the length and serialized column stored for dictionary_id.
+ ArrowErrorCode WriteDictionaryBatch(std::ostream& out, int32_t dictionary_id) {
+   const internal::Dictionary& dictionary = dictionaries_.Get(dictionary_id);
+
+   out << R"({"id": )" << dictionary_id;
+   out << R"(, "data": {"count": )" << dictionary.column_length;
+   out << R"(, "columns": [)" << dictionary.column_json << "]}}";
+
+   return NANOARROW_OK;
+ }
+
+ /// \brief Write the children of field as a JSON array of fields to out
+ ///
+ /// Creates output like `[{...}, {...}]`, or `[]` when field has no children.
+ ArrowErrorCode WriteFieldChildren(std::ostream& out, const ArrowSchema* field) {
+   out << "[";
+   for (int64_t child = 0; child < field->n_children; child++) {
+     if (child > 0) {
+       out << ", ";
+     }
+     NANOARROW_RETURN_NOT_OK(WriteField(out, field->children[child]));
+   }
+   out << "]";
+
+   return NANOARROW_OK;
+ }
+
+ ArrowErrorCode WriteType(std::ostream& out, const ArrowSchemaView* field) {
out << "{";
switch (field->type) {
@@ -447,6 +612,26 @@ class TestingJSONWriter {
return NANOARROW_OK;
}
+ /// \brief Write the "dictionary" member of a dictionary-encoded field to out
+ ///
+ /// Creates output like `{"id": 0, "indexType": {...}, "isOrdered": false}` where
+ /// indexType is written from indices_field.
+ ArrowErrorCode WriteFieldDictionary(std::ostream& out, int32_t dictionary_id,
+                                     bool is_ordered,
+                                     const ArrowSchemaView* indices_field) {
+   out << R"({"id": )" << dictionary_id << R"(, "indexType": )";
+   NANOARROW_RETURN_NOT_OK(WriteType(out, indices_field));
+
+   out << R"(, "isOrdered": )" << (is_ordered ? "true" : "false") << "}";
+   return NANOARROW_OK;
+ }
+
ArrowErrorCode WriteMetadataItem(std::ostream& out, ArrowMetadataReader* reader) {
ArrowStringView key;
ArrowStringView value;
@@ -746,6 +931,8 @@ class TestingJSONReader {
ArrowErrorCode ReadDataFile(const std::string& data_file_json, ArrowArrayStream* out,
int num_batch = kNumBatchReadAll,
ArrowError* error = nullptr) {
+ dictionaries_.clear();
+
try {
auto obj = json::parse(data_file_json);
NANOARROW_RETURN_NOT_OK(Check(obj.is_object(), error, "data file must be object"));
@@ -767,6 +954,11 @@ class TestingJSONReader {
NANOARROW_RETURN_NOT_OK(
ArrowArrayViewInitFromSchema(array_view.get(), schema.get(), error));
+ // Record any dictionaries that might be present
+ if (obj.contains("dictionaries")) {
+ NANOARROW_RETURN_NOT_OK(RecordDictionaryBatches(obj["dictionaries"], error));
+ }
+
// Get a vector of batch ids to parse
std::vector<size_t> batch_ids;
if (num_batch == kNumBatchOnlySchema) {
@@ -793,8 +985,8 @@ class TestingJSONReader {
NANOARROW_RETURN_NOT_OK(
ArrowArrayInitFromArrayView(array.get(), array_view.get(), error));
SetArrayAllocatorRecursive(array.get());
- NANOARROW_RETURN_NOT_OK(
- SetArrayBatch(batches[batch_ids[i]], array_view.get(), array.get(), error));
+ NANOARROW_RETURN_NOT_OK(SetArrayBatch(batches[batch_ids[i]], schema.get(),
+ array_view.get(), array.get(), error));
ArrowBasicArrayStreamSetArray(stream.get(), i, array.get());
}
@@ -864,7 +1056,8 @@ class TestingJSONReader {
NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromSchema(array.get(), schema, error));
SetArrayAllocatorRecursive(array.get());
- NANOARROW_RETURN_NOT_OK(SetArrayBatch(obj, array_view.get(), array.get(), error));
+ NANOARROW_RETURN_NOT_OK(
+ SetArrayBatch(obj, schema, array_view.get(), array.get(), error));
ArrowArrayMove(array.get(), out);
return NANOARROW_OK;
} catch (json::exception& e) {
@@ -894,7 +1087,8 @@ class TestingJSONReader {
SetArrayAllocatorRecursive(array.get());
// Parse the JSON into the array
- NANOARROW_RETURN_NOT_OK(SetArrayColumn(obj, array_view.get(), array.get(), error));
+ NANOARROW_RETURN_NOT_OK(
+ SetArrayColumn(obj, schema, array_view.get(), array.get(), error));
// Return the result
ArrowArrayMove(array.get(), out);
@@ -907,6 +1101,7 @@ class TestingJSONReader {
private:
ArrowBufferAllocator allocator_;
+ internal::DictionaryContext dictionaries_;
ArrowErrorCode SetSchema(ArrowSchema* schema, const json& value, ArrowError* error) {
NANOARROW_RETURN_NOT_OK(
@@ -917,6 +1112,9 @@ class TestingJSONReader {
NANOARROW_RETURN_NOT_OK_WITH_ERROR(
ArrowSchemaInitFromType(schema, NANOARROW_TYPE_STRUCT), error);
+ // Top-level schema is non-nullable
+ schema->flags = 0;
+
const auto& fields = value["fields"];
NANOARROW_RETURN_NOT_OK(
Check(fields.is_array(), error, "Schema fields must be array"));
@@ -939,17 +1137,18 @@ class TestingJSONReader {
ArrowErrorCode SetField(ArrowSchema* schema, const json& value, ArrowError* error) {
NANOARROW_RETURN_NOT_OK(
Check(value.is_object(), error, "Expected Field to be a JSON object"));
+ ArrowSchemaInit(schema);
+
NANOARROW_RETURN_NOT_OK(
Check(value.contains("name"), error, "Field missing key 'name'"));
- NANOARROW_RETURN_NOT_OK(
- Check(value.contains("nullable"), error, "Field missing key 'nullable'"));
NANOARROW_RETURN_NOT_OK(
Check(value.contains("type"), error, "Field missing key 'type'"));
+ NANOARROW_RETURN_NOT_OK(
+ Check(value.contains("nullable"), error, "Field missing key 'nullable'"));
NANOARROW_RETURN_NOT_OK(
Check(value.contains("children"), error, "Field missing key 'children'"));
- ArrowSchemaInit(schema);
-
+ // Name
const auto& name = value["name"];
NANOARROW_RETURN_NOT_OK(Check(name.is_string() || name.is_null(), error,
"Field name must be string or null"));
@@ -959,6 +1158,7 @@ class TestingJSONReader {
error);
}
+ // Nullability
const auto& nullable = value["nullable"];
NANOARROW_RETURN_NOT_OK(
Check(nullable.is_boolean(), error, "Field nullable must be boolean"));
@@ -968,6 +1168,38 @@ class TestingJSONReader {
schema->flags &= ~ARROW_FLAG_NULLABLE;
}
+ // Metadata
+ if (value.contains("metadata")) {
+ NANOARROW_RETURN_NOT_OK(SetMetadata(schema, value["metadata"], error));
+ }
+
+ // If we have a dictionary, this value needs to be in schema->dictionary
+ // and value["dictionary"] needs to be in schema
+ if (value.contains("dictionary")) {
+ // Put the index type in this schema
+ int32_t dictionary_id;
+ NANOARROW_RETURN_NOT_OK(
+ SetDictionary(schema, value["dictionary"], &dictionary_id, error));
+
+ // Allocate a dictionary and put this value (minus dictionary, metadata, and name)
+ json value_copy = value;
+ value_copy.erase("dictionary");
+ value_copy.erase("metadata");
+ value_copy["name"] = nullptr;
+ NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowSchemaAllocateDictionary(schema), error);
+ NANOARROW_RETURN_NOT_OK(SetField(schema->dictionary, value_copy, error));
+
+ // Keep track of this dictionary_id/schema for parsing batches
+ NANOARROW_RETURN_NOT_OK_WITH_ERROR(
+ dictionaries_.RecordSchema(dictionary_id, schema->dictionary), error);
+
+ // Validate!
+ ArrowSchemaView schema_view;
+ NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&schema_view, schema, error));
+
+ return NANOARROW_OK;
+ }
+
NANOARROW_RETURN_NOT_OK(SetType(schema, value["type"], error));
const auto& children = value["children"];
@@ -979,16 +1211,43 @@ class TestingJSONReader {
NANOARROW_RETURN_NOT_OK(SetField(schema->children[i], children[i], error));
}
- if (value.contains("metadata")) {
- NANOARROW_RETURN_NOT_OK(SetMetadata(schema, value["metadata"], error));
- }
-
// Validate!
ArrowSchemaView schema_view;
NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&schema_view, schema, error));
return NANOARROW_OK;
}
+ /// \brief Apply a field's "dictionary" JSON member to schema
+ ///
+ /// Expects `{"id": <integer>, "indexType": {...}, "isOrdered": <bool>}`. The index
+ /// type is parsed into schema, the ordered flag is set or cleared on
+ /// schema->flags, and the id is written to dictionary_id.
+ ///
+ /// Returns the failure code from Check()/SetType() (with error populated) if a
+ /// key is missing or has the wrong JSON type.
+ ArrowErrorCode SetDictionary(ArrowSchema* schema, const json& value,
+                              int32_t* dictionary_id, ArrowError* error) {
+   NANOARROW_RETURN_NOT_OK(Check(value.is_object(), error, "Dictionary must be object"));
+   NANOARROW_RETURN_NOT_OK(
+       Check(value.contains("id"), error, "Dictionary missing key 'id'"));
+   // The message names the key that is actually checked ('indexType')
+   NANOARROW_RETURN_NOT_OK(Check(value.contains("indexType"), error,
+                                 "Dictionary missing key 'indexType'"));
+   NANOARROW_RETURN_NOT_OK(
+       Check(value.contains("isOrdered"), error, "Dictionary missing key 'isOrdered'"));
+
+   const auto& id = value["id"];
+   NANOARROW_RETURN_NOT_OK(
+       Check(id.is_number_integer(), error, "Dictionary id must be integer"));
+   *dictionary_id = id.get<int32_t>();
+
+   // Parse the index type
+   NANOARROW_RETURN_NOT_OK(SetType(schema, value["indexType"], error));
+
+   // Set the flag
+   const auto& is_ordered = value["isOrdered"];
+   NANOARROW_RETURN_NOT_OK(
+       Check(is_ordered.is_boolean(), error, "Dictionary isOrdered must be bool"));
+   if (is_ordered.get<bool>()) {
+     schema->flags |= ARROW_FLAG_DICTIONARY_ORDERED;
+   } else {
+     schema->flags &= ~ARROW_FLAG_DICTIONARY_ORDERED;
+   }
+
+   return NANOARROW_OK;
+ }
+
ArrowErrorCode SetType(ArrowSchema* schema, const json& value, ArrowError* error) {
NANOARROW_RETURN_NOT_OK(Check(value.is_object(), error, "Type must be object"));
NANOARROW_RETURN_NOT_OK(
@@ -1314,8 +1573,9 @@ class TestingJSONReader {
return NANOARROW_OK;
}
- ArrowErrorCode SetArrayBatch(const json& value, ArrowArrayView* array_view,
- ArrowArray* array, ArrowError* error) {
+ ArrowErrorCode SetArrayBatch(const json& value, const ArrowSchema* schema,
+ ArrowArrayView* array_view, ArrowArray* array,
+ ArrowError* error) {
NANOARROW_RETURN_NOT_OK(
Check(value.is_object(), error, "Expected RecordBatch to be a JSON object"));
@@ -1337,8 +1597,9 @@ class TestingJSONReader {
"RecordBatch children has incorrect size"));
for (int64_t i = 0; i < array_view->n_children; i++) {
- NANOARROW_RETURN_NOT_OK(
- SetArrayColumn(columns[i], array_view->children[i], array->children[i], error));
+ NANOARROW_RETURN_NOT_OK(SetArrayColumn(columns[i], schema->children[i],
+ array_view->children[i], array->children[i],
+ error));
}
// Validate the array view
@@ -1354,8 +1615,56 @@ class TestingJSONReader {
return NANOARROW_OK;
}
- ArrowErrorCode SetArrayColumn(const json& value, ArrowArrayView* array_view,
- ArrowArray* array, ArrowError* error,
+ /// \brief Record every item of the data file's "dictionaries" array
+ ///
+ /// Fails if value is not a JSON array or if any item is rejected by
+ /// RecordDictionaryBatch(); items are processed in array order.
+ ArrowErrorCode RecordDictionaryBatches(const json& value, ArrowError* error) {
+   NANOARROW_RETURN_NOT_OK(Check(value.is_array(), error, "dictionaries must be array"));
+
+   const size_t n_batches = value.size();
+   for (size_t batch_i = 0; batch_i < n_batches; batch_i++) {
+     NANOARROW_RETURN_NOT_OK(RecordDictionaryBatch(value[batch_i], error));
+   }
+
+   return NANOARROW_OK;
+ }
+
+ /// \brief Validate one item of "dictionaries" and store its values column
+ ///
+ /// Expects `{"id": <integer>, "data": {"count": <integer>, "columns": [<column>]}}`
+ /// where id refers to a dictionary already known to dictionaries_. The single
+ /// column is stored as serialized JSON together with count.
+ ///
+ /// Returns the failure code from Check() (with error populated) if a key is
+ /// missing, has the wrong JSON type, or the id is unknown.
+ ArrowErrorCode RecordDictionaryBatch(const json& value, ArrowError* error) {
+   NANOARROW_RETURN_NOT_OK(
+       Check(value.is_object(), error, "dictionary batch must be object"));
+   NANOARROW_RETURN_NOT_OK(
+       Check(value.contains("id"), error, "dictionary batch missing key 'id'"));
+   NANOARROW_RETURN_NOT_OK(
+       Check(value.contains("data"), error, "dictionary batch missing key 'data'"));
+
+   const auto& id = value["id"];
+   NANOARROW_RETURN_NOT_OK(
+       Check(id.is_number_integer(), error, "dictionary batch id must be integer"));
+   // Dictionary ids are int32_t everywhere else, so read the id with that type
+   const int32_t dictionary_id = id.get<int32_t>();
+   NANOARROW_RETURN_NOT_OK(Check(dictionaries_.HasDictionaryForId(dictionary_id), error,
+                                 "dictionary batch has unknown id"));
+
+   const auto& batch = value["data"];
+   NANOARROW_RETURN_NOT_OK(
+       Check(batch.is_object(), error, "dictionary batch data must be object"));
+   NANOARROW_RETURN_NOT_OK(Check(batch.contains("columns"), error,
+                                 "dictionary batch missing key 'columns'"));
+   NANOARROW_RETURN_NOT_OK(
+       Check(batch.contains("count"), error, "dictionary batch missing key 'count'"));
+
+   const auto& batch_columns = batch["columns"];
+   NANOARROW_RETURN_NOT_OK(Check(batch_columns.is_array() && batch_columns.size() == 1,
+                                 error,
+                                 "dictionary batch columns must be array of size 1"));
+
+   const auto& batch_count = batch["count"];
+   NANOARROW_RETURN_NOT_OK(Check(batch_count.is_number_integer(), error,
+                                 "dictionary batch count must be integer"));
+
+   // The recorded length is 64-bit: read count as int64_t so that large
+   // counts are not truncated on the way in
+   dictionaries_.RecordArray(dictionary_id, batch_count.get<int64_t>(),
+                             batch_columns[0].dump());
+   return NANOARROW_OK;
+ }
+
+ ArrowErrorCode SetArrayColumn(const json& value, const ArrowSchema* schema,
+ ArrowArrayView* array_view, ArrowArray* array,
+ ArrowError* error,
const std::string& parent_error_prefix = "") {
NANOARROW_RETURN_NOT_OK(
Check(value.is_object(), error, "Expected Column to be a JSON object"));
@@ -1388,7 +1697,8 @@ class TestingJSONReader {
error_prefix + "children has incorrect size"));
for (int64_t i = 0; i < array_view->n_children; i++) {
- NANOARROW_RETURN_NOT_OK(SetArrayColumn(children[i], array_view->children[i],
+ NANOARROW_RETURN_NOT_OK(SetArrayColumn(children[i], schema->children[i],
+ array_view->children[i],
array->children[i], error, error_prefix));
}
}
@@ -1416,6 +1726,28 @@ class TestingJSONReader {
ArrowBufferView* buffer_view = array_view->buffer_views + i;
buffer_view->data.as_uint8 = buffer->data;
buffer_view->size_bytes = buffer->size_bytes;
+
+ // If this is a validity buffer with a big enough size, set the array_view's
+ // null_count
+ if (array_view->layout.buffer_type[i] == NANOARROW_BUFFER_TYPE_VALIDITY &&
+ _ArrowBytesForBits(array_view->length) <= buffer_view->size_bytes) {
+ array_view->null_count =
+ array_view->length -
+ ArrowBitCountSet(buffer_view->data.as_uint8, 0, array_view->length);
+ }
+ }
+
+ // If there is a dictionary associated with schema, parse its value into dictionary
+ if (schema->dictionary != nullptr) {
+ NANOARROW_RETURN_NOT_OK(Check(
+ dictionaries_.HasDictionaryForSchema(schema->dictionary), error,
+ error_prefix +
+ "dictionary could not be resolved from dictionary id in SetArrayColumn()"));
+
+ const internal::Dictionary& dict = dictionaries_.Get(schema->dictionary);
+ NANOARROW_RETURN_NOT_OK(SetArrayColumn(
+ json::parse(dict.column_json), schema->dictionary, array_view->dictionary,
+ array->dictionary, error, error_prefix + "-> <dictionary> "));
}
// Validate the array view
@@ -1424,9 +1756,10 @@ class TestingJSONReader {
error_prefix + "failed to validate: "));
// Flush length and buffer pointers to the Array
- array->length = array_view->length;
NANOARROW_RETURN_NOT_OK_WITH_ERROR(
ArrowArrayFinishBuilding(array, NANOARROW_VALIDATION_LEVEL_NONE, nullptr), error);
+ array->length = array_view->length;
+ array->null_count = array_view->null_count;
return NANOARROW_OK;
}
@@ -1885,6 +2218,9 @@ class TestingJSONComparison {
ArrowErrorCode CompareSchema(const ArrowSchema* actual, const ArrowSchema* expected,
ArrowError* error = nullptr,
const std::string& path = "") {
+ writer_actual_.ResetDictionaries();
+ writer_expected_.ResetDictionaries();
+
// Compare the top-level schema "manually" because (1) map type needs special-cased
// comparison and (2) it's easier to read the output if differences are separated
// by field.
@@ -1926,13 +2262,13 @@ class TestingJSONComparison {
// Compare metadata
std::stringstream ss;
- NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_.WriteMetadata(ss, actual->metadata),
+ NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_actual_.WriteMetadata(ss, actual->metadata),
error);
std::string actual_metadata = ss.str();
ss.str("");
- NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_.WriteMetadata(ss, expected->metadata),
- error);
+ NANOARROW_RETURN_NOT_OK_WITH_ERROR(
+ writer_expected_.WriteMetadata(ss, expected->metadata), error);
std::string expected_metadata = ss.str();
if (actual_metadata != expected_metadata) {
@@ -1960,6 +2296,14 @@ class TestingJSONComparison {
return EINVAL;
}
+ // "Write" the schema using both writers to ensure dictionary ids can be resolved
+ // using the ArrowSchema* pointers from schema_
+ std::stringstream ss;
+ writer_actual_.ResetDictionaries();
+ writer_expected_.ResetDictionaries();
+ writer_actual_.WriteSchema(ss, schema_.get());
+ writer_expected_.WriteSchema(ss, schema_.get());
+
return NANOARROW_OK;
}
@@ -1993,7 +2337,8 @@ class TestingJSONComparison {
}
private:
- TestingJSONWriter writer_;
+ TestingJSONWriter writer_actual_;
+ TestingJSONWriter writer_expected_;
std::vector<Difference> differences_;
nanoarrow::UniqueSchema schema_;
nanoarrow::UniqueArrayView actual_;
@@ -2019,11 +2364,11 @@ class TestingJSONComparison {
ArrowError* error, const std::string& path = "") {
std::stringstream ss;
- NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_.WriteField(ss, expected), error);
+ NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_expected_.WriteField(ss, expected), error);
std::string expected_json = ss.str();
ss.str("");
- NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_.WriteField(ss, actual), error);
+ NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_actual_.WriteField(ss, actual), error);
std::string actual_json = ss.str();
if (actual_json != expected_json) {
@@ -2036,13 +2381,31 @@ class TestingJSONComparison {
ArrowErrorCode CompareColumn(ArrowSchema* schema, ArrowArrayView* actual,
ArrowArrayView* expected, ArrowError* error,
const std::string& path = "") {
- std::stringstream ss;
+ // Compare children and dictionaries first, then higher-level structures after.
+ // This is redundant because the higher-level serialized JSON will also report
+ // a difference if deeply nested children have differences; however, it will not
+ // contain dictionaries and this output is slightly better (more targeted differences
+ // that are slightly easier to read appear first).
+ for (int64_t i = 0; i < schema->n_children; i++) {
+ NANOARROW_RETURN_NOT_OK(
+ CompareColumn(schema->children[i], actual->children[i], expected->children[i],
+ error, path + ".children[" + std::to_string(i) + "]"));
+ }
- NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_.WriteColumn(ss, schema, expected), error);
+ if (schema->dictionary != nullptr) {
+ NANOARROW_RETURN_NOT_OK(CompareColumn(schema->dictionary, actual->dictionary,
+ expected->dictionary, error,
+ path + ".dictionary"));
+ }
+
+ std::stringstream ss;
+ NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_expected_.WriteColumn(ss, schema, expected),
+ error);
std::string expected_json = ss.str();
ss.str("");
- NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_.WriteColumn(ss, schema, actual), error);
+ NANOARROW_RETURN_NOT_OK_WITH_ERROR(writer_actual_.WriteColumn(ss, schema, actual),
+ error);
std::string actual_json = ss.str();
if (actual_json != expected_json) {