You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2016/11/18 19:58:59 UTC
[1/2] arrow git commit: ARROW-373: [C++] JSON serialization format
for testing
Repository: arrow
Updated Branches:
refs/heads/master 841709627 -> ed6ec3b76
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/type.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index 4fd50b7..589bdad 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -20,6 +20,8 @@
#include <sstream>
#include <string>
+#include "arrow/util/status.h"
+
namespace arrow {
std::string Field::ToString() const {
@@ -44,9 +46,24 @@ bool DataType::Equals(const DataType* other) const {
return equals;
}
+std::string BooleanType::ToString() const {
+ return name();
+}
+
+FloatingPointMeta::Precision HalfFloatType::precision() const {
+ return FloatingPointMeta::HALF;
+}
+
+FloatingPointMeta::Precision FloatType::precision() const {
+ return FloatingPointMeta::SINGLE;
+}
+
+FloatingPointMeta::Precision DoubleType::precision() const {
+ return FloatingPointMeta::DOUBLE;
+}
+
std::string StringType::ToString() const {
- std::string result(name());
- return result;
+ return std::string("string");
}
std::string ListType::ToString() const {
@@ -56,7 +73,7 @@ std::string ListType::ToString() const {
}
std::string BinaryType::ToString() const {
- return std::string(name());
+ return std::string("binary");
}
std::string StructType::ToString() const {
@@ -71,4 +88,103 @@ std::string StructType::ToString() const {
return s.str();
}
+std::string UnionType::ToString() const {
+ std::stringstream s;
+
+ if (mode == UnionMode::SPARSE) {
+ s << "union[sparse]<";
+ } else {
+ s << "union[dense]<";
+ }
+
+ for (size_t i = 0; i < children_.size(); ++i) {
+ if (i) { s << ", "; }
+ s << children_[i]->ToString();
+ }
+ s << ">";
+ return s.str();
+}
+
+int NullType::bit_width() const {
+ return 0;
+}
+
+std::string NullType::ToString() const {
+ return name();
+}
+
+// Visitors and template instantiation
+
+#define ACCEPT_VISITOR(TYPE) \
+ Status TYPE::Accept(TypeVisitor* visitor) const { return visitor->Visit(*this); }
+
+ACCEPT_VISITOR(NullType);
+ACCEPT_VISITOR(BooleanType);
+ACCEPT_VISITOR(BinaryType);
+ACCEPT_VISITOR(StringType);
+ACCEPT_VISITOR(ListType);
+ACCEPT_VISITOR(StructType);
+ACCEPT_VISITOR(DecimalType);
+ACCEPT_VISITOR(UnionType);
+ACCEPT_VISITOR(DateType);
+ACCEPT_VISITOR(TimeType);
+ACCEPT_VISITOR(TimestampType);
+ACCEPT_VISITOR(IntervalType);
+
+#define TYPE_FACTORY(NAME, KLASS) \
+ std::shared_ptr<DataType> NAME() { \
+ static std::shared_ptr<DataType> result = std::make_shared<KLASS>(); \
+ return result; \
+ }
+
+TYPE_FACTORY(null, NullType);
+TYPE_FACTORY(boolean, BooleanType);
+TYPE_FACTORY(int8, Int8Type);
+TYPE_FACTORY(uint8, UInt8Type);
+TYPE_FACTORY(int16, Int16Type);
+TYPE_FACTORY(uint16, UInt16Type);
+TYPE_FACTORY(int32, Int32Type);
+TYPE_FACTORY(uint32, UInt32Type);
+TYPE_FACTORY(int64, Int64Type);
+TYPE_FACTORY(uint64, UInt64Type);
+TYPE_FACTORY(float16, HalfFloatType);
+TYPE_FACTORY(float32, FloatType);
+TYPE_FACTORY(float64, DoubleType);
+TYPE_FACTORY(utf8, StringType);
+TYPE_FACTORY(binary, BinaryType);
+TYPE_FACTORY(date, DateType);
+
+std::shared_ptr<DataType> timestamp(TimeUnit unit) {
+ static std::shared_ptr<DataType> result = std::make_shared<TimestampType>();
+ return result;
+}
+
+std::shared_ptr<DataType> time(TimeUnit unit) {
+ static std::shared_ptr<DataType> result = std::make_shared<TimeType>();
+ return result;
+}
+
+std::shared_ptr<DataType> list(const std::shared_ptr<DataType>& value_type) {
+ return std::make_shared<ListType>(value_type);
+}
+
+std::shared_ptr<DataType> list(const std::shared_ptr<Field>& value_field) {
+ return std::make_shared<ListType>(value_field);
+}
+
+std::shared_ptr<DataType> struct_(const std::vector<std::shared_ptr<Field>>& fields) {
+ return std::make_shared<StructType>(fields);
+}
+
+std::shared_ptr<DataType> ARROW_EXPORT union_(
+ const std::vector<std::shared_ptr<Field>>& child_fields,
+ const std::vector<uint8_t>& type_ids, UnionMode mode) {
+ return std::make_shared<UnionType>(child_fields, type_ids, mode);
+}
+
+std::shared_ptr<Field> field(
+ const std::string& name, const TypePtr& type, bool nullable, int64_t dictionary) {
+ return std::make_shared<Field>(name, type, nullable, dictionary);
+}
+
} // namespace arrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/type.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index ea8516f..5b4d7bc 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -23,7 +23,9 @@
#include <string>
#include <vector>
+#include "arrow/type_fwd.h"
#include "arrow/util/macros.h"
+#include "arrow/util/status.h"
#include "arrow/util/visibility.h"
namespace arrow {
@@ -50,17 +52,20 @@ struct Type {
UINT64 = 8,
INT64 = 9,
+ // 2-byte floating point value
+ HALF_FLOAT = 10,
+
// 4-byte floating point value
- FLOAT = 10,
+ FLOAT = 11,
// 8-byte floating point value
- DOUBLE = 11,
+ DOUBLE = 12,
// UTF8 variable-length string as List<Char>
STRING = 13,
// Variable-length bytes (no guarantee of UTF8-ness)
- BINARY = 15,
+ BINARY = 14,
// By default, int32 days since the UNIX epoch
DATE = 16,
@@ -69,19 +74,16 @@ struct Type {
// Default unit millisecond
TIMESTAMP = 17,
- // Timestamp as double seconds since the UNIX epoch
- TIMESTAMP_DOUBLE = 18,
-
// Exact time encoded with int64, default unit millisecond
- TIME = 19,
+ TIME = 18,
+
+ // YEAR_MONTH or DAY_TIME interval in SQL style
+ INTERVAL = 19,
// Precision- and scale-based decimal type. Storage type depends on the
// parameters.
DECIMAL = 20,
- // Decimal value encoded as a text string
- DECIMAL_TEXT = 21,
-
// A list of some logical data type
LIST = 30,
@@ -89,19 +91,16 @@ struct Type {
STRUCT = 31,
// Unions of logical types
- DENSE_UNION = 32,
- SPARSE_UNION = 33,
+ UNION = 32,
- // Union<Null, Int32, Double, String, Bool>
- JSON_SCALAR = 50,
+ // Timestamp as double seconds since the UNIX epoch
+ TIMESTAMP_DOUBLE = 33,
- // User-defined type
- USER = 60
+ // Decimal value encoded as a text string
+ DECIMAL_TEXT = 34,
};
};
-struct Field;
-
struct ARROW_EXPORT DataType {
Type::type type;
@@ -123,15 +122,32 @@ struct ARROW_EXPORT DataType {
const std::shared_ptr<Field>& child(int i) const { return children_[i]; }
+ const std::vector<std::shared_ptr<Field>>& children() const { return children_; }
+
int num_children() const { return children_.size(); }
- virtual int value_size() const { return -1; }
+ virtual Status Accept(TypeVisitor* visitor) const = 0;
virtual std::string ToString() const = 0;
};
typedef std::shared_ptr<DataType> TypePtr;
+struct ARROW_EXPORT FixedWidthMeta {
+ virtual int bit_width() const = 0;
+};
+
+struct ARROW_EXPORT IntegerMeta {
+ virtual bool is_signed() const = 0;
+};
+
+struct ARROW_EXPORT FloatingPointMeta {
+ enum Precision { HALF, SINGLE, DOUBLE };
+ virtual Precision precision() const = 0;
+};
+
+struct NoExtraMeta {};
+
// A field is a piece of metadata that includes (for now) a name and a data
// type
struct ARROW_EXPORT Field {
@@ -139,7 +155,7 @@ struct ARROW_EXPORT Field {
std::string name;
// The field's data type
- TypePtr type;
+ std::shared_ptr<DataType> type;
// Fields can be nullable
bool nullable;
@@ -148,8 +164,8 @@ struct ARROW_EXPORT Field {
// 0 means it's not dictionary encoded
int64_t dictionary;
- Field(const std::string& name, const TypePtr& type, bool nullable = true,
- int64_t dictionary = 0)
+ Field(const std::string& name, const std::shared_ptr<DataType>& type,
+ bool nullable = true, int64_t dictionary = 0)
: name(name), type(type), nullable(nullable), dictionary(dictionary) {}
bool operator==(const Field& other) const { return this->Equals(other); }
@@ -168,78 +184,112 @@ struct ARROW_EXPORT Field {
};
typedef std::shared_ptr<Field> FieldPtr;
-template <typename Derived>
-struct ARROW_EXPORT PrimitiveType : public DataType {
- PrimitiveType() : DataType(Derived::type_enum) {}
+struct PrimitiveCType : public DataType {
+ using DataType::DataType;
+};
+
+template <typename DERIVED, Type::type TYPE_ID, typename C_TYPE>
+struct ARROW_EXPORT CTypeImpl : public PrimitiveCType, public FixedWidthMeta {
+ using c_type = C_TYPE;
+ static constexpr Type::type type_id = TYPE_ID;
+
+ CTypeImpl() : PrimitiveCType(TYPE_ID) {}
+ int bit_width() const override { return sizeof(C_TYPE) * 8; }
+
+ Status Accept(TypeVisitor* visitor) const override {
+ return visitor->Visit(*static_cast<const DERIVED*>(this));
+ }
+
+ std::string ToString() const override { return std::string(DERIVED::name()); }
+};
+
+struct ARROW_EXPORT NullType : public DataType, public FixedWidthMeta {
+ static constexpr Type::type type_id = Type::NA;
+
+ NullType() : DataType(Type::NA) {}
+
+ int bit_width() const override;
+ Status Accept(TypeVisitor* visitor) const override;
std::string ToString() const override;
+
+ static std::string name() { return "null"; }
+};
+
+template <typename DERIVED, Type::type TYPE_ID, typename C_TYPE>
+struct IntegerTypeImpl : public CTypeImpl<DERIVED, TYPE_ID, C_TYPE>, public IntegerMeta {
+ bool is_signed() const override { return std::is_signed<C_TYPE>::value; }
};
-template <typename Derived>
-inline std::string PrimitiveType<Derived>::ToString() const {
- std::string result(static_cast<const Derived*>(this)->name());
- return result;
-}
+struct ARROW_EXPORT BooleanType : public DataType, FixedWidthMeta {
+ static constexpr Type::type type_id = Type::BOOL;
-#define PRIMITIVE_DECL(TYPENAME, C_TYPE, ENUM, SIZE, NAME) \
- typedef C_TYPE c_type; \
- static constexpr Type::type type_enum = Type::ENUM; \
- \
- TYPENAME() : PrimitiveType<TYPENAME>() {} \
- \
- virtual int value_size() const { return SIZE; } \
- \
- static const char* name() { return NAME; }
+ BooleanType() : DataType(Type::BOOL) {}
-struct ARROW_EXPORT NullType : public PrimitiveType<NullType> {
- PRIMITIVE_DECL(NullType, void, NA, 0, "null");
+ Status Accept(TypeVisitor* visitor) const override;
+ std::string ToString() const override;
+
+ int bit_width() const override { return 1; }
+ static std::string name() { return "bool"; }
};
-struct ARROW_EXPORT BooleanType : public PrimitiveType<BooleanType> {
- PRIMITIVE_DECL(BooleanType, uint8_t, BOOL, 1, "bool");
+struct ARROW_EXPORT UInt8Type : public IntegerTypeImpl<UInt8Type, Type::UINT8, uint8_t> {
+ static std::string name() { return "uint8"; }
};
-struct ARROW_EXPORT UInt8Type : public PrimitiveType<UInt8Type> {
- PRIMITIVE_DECL(UInt8Type, uint8_t, UINT8, 1, "uint8");
+struct ARROW_EXPORT Int8Type : public IntegerTypeImpl<Int8Type, Type::INT8, int8_t> {
+ static std::string name() { return "int8"; }
};
-struct ARROW_EXPORT Int8Type : public PrimitiveType<Int8Type> {
- PRIMITIVE_DECL(Int8Type, int8_t, INT8, 1, "int8");
+struct ARROW_EXPORT UInt16Type
+ : public IntegerTypeImpl<UInt16Type, Type::UINT16, uint16_t> {
+ static std::string name() { return "uint16"; }
};
-struct ARROW_EXPORT UInt16Type : public PrimitiveType<UInt16Type> {
- PRIMITIVE_DECL(UInt16Type, uint16_t, UINT16, 2, "uint16");
+struct ARROW_EXPORT Int16Type : public IntegerTypeImpl<Int16Type, Type::INT16, int16_t> {
+ static std::string name() { return "int16"; }
};
-struct ARROW_EXPORT Int16Type : public PrimitiveType<Int16Type> {
- PRIMITIVE_DECL(Int16Type, int16_t, INT16, 2, "int16");
+struct ARROW_EXPORT UInt32Type
+ : public IntegerTypeImpl<UInt32Type, Type::UINT32, uint32_t> {
+ static std::string name() { return "uint32"; }
};
-struct ARROW_EXPORT UInt32Type : public PrimitiveType<UInt32Type> {
- PRIMITIVE_DECL(UInt32Type, uint32_t, UINT32, 4, "uint32");
+struct ARROW_EXPORT Int32Type : public IntegerTypeImpl<Int32Type, Type::INT32, int32_t> {
+ static std::string name() { return "int32"; }
};
-struct ARROW_EXPORT Int32Type : public PrimitiveType<Int32Type> {
- PRIMITIVE_DECL(Int32Type, int32_t, INT32, 4, "int32");
+struct ARROW_EXPORT UInt64Type
+ : public IntegerTypeImpl<UInt64Type, Type::UINT64, uint64_t> {
+ static std::string name() { return "uint64"; }
};
-struct ARROW_EXPORT UInt64Type : public PrimitiveType<UInt64Type> {
- PRIMITIVE_DECL(UInt64Type, uint64_t, UINT64, 8, "uint64");
+struct ARROW_EXPORT Int64Type : public IntegerTypeImpl<Int64Type, Type::INT64, int64_t> {
+ static std::string name() { return "int64"; }
};
-struct ARROW_EXPORT Int64Type : public PrimitiveType<Int64Type> {
- PRIMITIVE_DECL(Int64Type, int64_t, INT64, 8, "int64");
+struct ARROW_EXPORT HalfFloatType
+ : public CTypeImpl<HalfFloatType, Type::HALF_FLOAT, uint16_t>,
+ public FloatingPointMeta {
+ Precision precision() const override;
+ static std::string name() { return "halffloat"; }
};
-struct ARROW_EXPORT FloatType : public PrimitiveType<FloatType> {
- PRIMITIVE_DECL(FloatType, float, FLOAT, 4, "float");
+struct ARROW_EXPORT FloatType : public CTypeImpl<FloatType, Type::FLOAT, float>,
+ public FloatingPointMeta {
+ Precision precision() const override;
+ static std::string name() { return "float"; }
};
-struct ARROW_EXPORT DoubleType : public PrimitiveType<DoubleType> {
- PRIMITIVE_DECL(DoubleType, double, DOUBLE, 8, "double");
+struct ARROW_EXPORT DoubleType : public CTypeImpl<DoubleType, Type::DOUBLE, double>,
+ public FloatingPointMeta {
+ Precision precision() const override;
+ static std::string name() { return "double"; }
};
-struct ARROW_EXPORT ListType : public DataType {
+struct ARROW_EXPORT ListType : public DataType, public NoExtraMeta {
+ static constexpr Type::type type_id = Type::LIST;
+
// List can contain any other logical value type
explicit ListType(const std::shared_ptr<DataType>& value_type)
: ListType(std::make_shared<Field>("item", value_type)) {}
@@ -252,16 +302,21 @@ struct ARROW_EXPORT ListType : public DataType {
const std::shared_ptr<DataType>& value_type() const { return children_[0]->type; }
- static char const* name() { return "list"; }
-
+ Status Accept(TypeVisitor* visitor) const override;
std::string ToString() const override;
+
+ static std::string name() { return "list"; }
};
// BinaryType type is reprsents lists of 1-byte values.
-struct ARROW_EXPORT BinaryType : public DataType {
+struct ARROW_EXPORT BinaryType : public DataType, public NoExtraMeta {
+ static constexpr Type::type type_id = Type::BINARY;
+
BinaryType() : BinaryType(Type::BINARY) {}
- static char const* name() { return "binary"; }
+
+ Status Accept(TypeVisitor* visitor) const override;
std::string ToString() const override;
+ static std::string name() { return "binary"; }
protected:
// Allow subclasses to change the logical type.
@@ -270,25 +325,160 @@ struct ARROW_EXPORT BinaryType : public DataType {
// UTF encoded strings
struct ARROW_EXPORT StringType : public BinaryType {
- StringType() : BinaryType(Type::STRING) {}
+ static constexpr Type::type type_id = Type::STRING;
- static char const* name() { return "string"; }
+ StringType() : BinaryType(Type::STRING) {}
+ Status Accept(TypeVisitor* visitor) const override;
std::string ToString() const override;
+ static std::string name() { return "utf8"; }
};
-struct ARROW_EXPORT StructType : public DataType {
+struct ARROW_EXPORT StructType : public DataType, public NoExtraMeta {
+ static constexpr Type::type type_id = Type::STRUCT;
+
explicit StructType(const std::vector<std::shared_ptr<Field>>& fields)
: DataType(Type::STRUCT) {
children_ = fields;
}
+ Status Accept(TypeVisitor* visitor) const override;
std::string ToString() const override;
+ static std::string name() { return "struct"; }
+};
+
+struct ARROW_EXPORT DecimalType : public DataType {
+ static constexpr Type::type type_id = Type::DECIMAL;
+
+ explicit DecimalType(int precision_, int scale_)
+ : DataType(Type::DECIMAL), precision(precision_), scale(scale_) {}
+ int precision;
+ int scale;
+
+ Status Accept(TypeVisitor* visitor) const override;
+ std::string ToString() const override;
+ static std::string name() { return "decimal"; }
+};
+
+enum class UnionMode : char { SPARSE, DENSE };
+
+struct ARROW_EXPORT UnionType : public DataType {
+ static constexpr Type::type type_id = Type::UNION;
+
+ UnionType(const std::vector<std::shared_ptr<Field>>& child_fields,
+ const std::vector<uint8_t>& type_ids, UnionMode mode = UnionMode::SPARSE)
+ : DataType(Type::UNION), mode(mode), type_ids(type_ids) {
+ children_ = child_fields;
+ }
+
+ std::string ToString() const override;
+ static std::string name() { return "union"; }
+ Status Accept(TypeVisitor* visitor) const override;
+
+ UnionMode mode;
+ std::vector<uint8_t> type_ids;
+};
+
+struct ARROW_EXPORT DateType : public DataType, public NoExtraMeta {
+ static constexpr Type::type type_id = Type::DATE;
+
+ DateType() : DataType(Type::DATE) {}
+
+ Status Accept(TypeVisitor* visitor) const override;
+ std::string ToString() const override { return name(); }
+ static std::string name() { return "date"; }
+};
+
+enum class TimeUnit : char { SECOND = 0, MILLI = 1, MICRO = 2, NANO = 3 };
+
+struct ARROW_EXPORT TimeType : public DataType {
+ static constexpr Type::type type_id = Type::TIME;
+ using Unit = TimeUnit;
+
+ TimeUnit unit;
+
+ explicit TimeType(TimeUnit unit = TimeUnit::MILLI) : DataType(Type::TIME), unit(unit) {}
+ TimeType(const TimeType& other) : TimeType(other.unit) {}
+
+ Status Accept(TypeVisitor* visitor) const override;
+ std::string ToString() const override { return name(); }
+ static std::string name() { return "time"; }
+};
+
+struct ARROW_EXPORT TimestampType : public DataType, public FixedWidthMeta {
+ using Unit = TimeUnit;
+
+ typedef int64_t c_type;
+ static constexpr Type::type type_id = Type::TIMESTAMP;
+
+ int bit_width() const override { return sizeof(int64_t) * 8; }
+
+ TimeUnit unit;
+
+ explicit TimestampType(TimeUnit unit = TimeUnit::MILLI)
+ : DataType(Type::TIMESTAMP), unit(unit) {}
+
+ TimestampType(const TimestampType& other) : TimestampType(other.unit) {}
+
+ Status Accept(TypeVisitor* visitor) const override;
+ std::string ToString() const override { return name(); }
+ static std::string name() { return "timestamp"; }
+};
+
+struct ARROW_EXPORT IntervalType : public DataType, public FixedWidthMeta {
+ enum class Unit : char { YEAR_MONTH = 0, DAY_TIME = 1 };
+
+ typedef int64_t c_type;
+ static constexpr Type::type type_id = Type::INTERVAL;
+
+ int bit_width() const override { return sizeof(int64_t) * 8; }
+
+ Unit unit;
+
+ explicit IntervalType(Unit unit = Unit::YEAR_MONTH)
+ : DataType(Type::INTERVAL), unit(unit) {}
+
+ IntervalType(const IntervalType& other) : IntervalType(other.unit) {}
+
+ Status Accept(TypeVisitor* visitor) const override;
+ std::string ToString() const override { return name(); }
+ static std::string name() { return "date"; }
};
-// These will be defined elsewhere
-template <typename T>
-struct TypeTraits {};
+// Factory functions
+
+std::shared_ptr<DataType> ARROW_EXPORT null();
+std::shared_ptr<DataType> ARROW_EXPORT boolean();
+std::shared_ptr<DataType> ARROW_EXPORT int8();
+std::shared_ptr<DataType> ARROW_EXPORT int16();
+std::shared_ptr<DataType> ARROW_EXPORT int32();
+std::shared_ptr<DataType> ARROW_EXPORT int64();
+std::shared_ptr<DataType> ARROW_EXPORT uint8();
+std::shared_ptr<DataType> ARROW_EXPORT uint16();
+std::shared_ptr<DataType> ARROW_EXPORT uint32();
+std::shared_ptr<DataType> ARROW_EXPORT uint64();
+std::shared_ptr<DataType> ARROW_EXPORT float16();
+std::shared_ptr<DataType> ARROW_EXPORT float32();
+std::shared_ptr<DataType> ARROW_EXPORT float64();
+std::shared_ptr<DataType> ARROW_EXPORT utf8();
+std::shared_ptr<DataType> ARROW_EXPORT binary();
+
+std::shared_ptr<DataType> ARROW_EXPORT list(const std::shared_ptr<Field>& value_type);
+std::shared_ptr<DataType> ARROW_EXPORT list(const std::shared_ptr<DataType>& value_type);
+
+std::shared_ptr<DataType> ARROW_EXPORT date();
+std::shared_ptr<DataType> ARROW_EXPORT timestamp(TimeUnit unit);
+std::shared_ptr<DataType> ARROW_EXPORT time(TimeUnit unit);
+
+std::shared_ptr<DataType> ARROW_EXPORT struct_(
+ const std::vector<std::shared_ptr<Field>>& fields);
+
+std::shared_ptr<DataType> ARROW_EXPORT union_(
+ const std::vector<std::shared_ptr<Field>>& child_fields,
+ const std::vector<uint8_t>& type_ids, UnionMode mode = UnionMode::SPARSE);
+
+std::shared_ptr<Field> ARROW_EXPORT field(const std::string& name,
+ const std::shared_ptr<DataType>& type, bool nullable = true, int64_t dictionary = 0);
} // namespace arrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/type_fwd.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h
new file mode 100644
index 0000000..6d660f4
--- /dev/null
+++ b/cpp/src/arrow/type_fwd.h
@@ -0,0 +1,157 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef ARROW_TYPE_FWD_H
+#define ARROW_TYPE_FWD_H
+
+namespace arrow {
+
+class Status;
+
+struct DataType;
+class Array;
+class ArrayBuilder;
+struct Field;
+
+class Buffer;
+class MemoryPool;
+class RecordBatch;
+class Schema;
+
+struct NullType;
+class NullArray;
+
+struct BooleanType;
+class BooleanArray;
+class BooleanBuilder;
+
+struct BinaryType;
+class BinaryArray;
+class BinaryBuilder;
+
+struct StringType;
+class StringArray;
+class StringBuilder;
+
+struct ListType;
+class ListArray;
+class ListBuilder;
+
+struct StructType;
+class StructArray;
+class StructBuilder;
+
+struct DecimalType;
+class DecimalArray;
+
+struct UnionType;
+class UnionArray;
+
+template <typename TypeClass>
+class NumericArray;
+
+template <typename TypeClass>
+class NumericBuilder;
+
+#define _NUMERIC_TYPE_DECL(KLASS) \
+ struct KLASS##Type; \
+ using KLASS##Array = NumericArray<KLASS##Type>; \
+ using KLASS##Builder = NumericBuilder<KLASS##Type>;
+
+_NUMERIC_TYPE_DECL(Int8);
+_NUMERIC_TYPE_DECL(Int16);
+_NUMERIC_TYPE_DECL(Int32);
+_NUMERIC_TYPE_DECL(Int64);
+_NUMERIC_TYPE_DECL(UInt8);
+_NUMERIC_TYPE_DECL(UInt16);
+_NUMERIC_TYPE_DECL(UInt32);
+_NUMERIC_TYPE_DECL(UInt64);
+_NUMERIC_TYPE_DECL(HalfFloat);
+_NUMERIC_TYPE_DECL(Float);
+_NUMERIC_TYPE_DECL(Double);
+
+#undef _NUMERIC_TYPE_DECL
+
+struct DateType;
+class DateArray;
+
+struct TimeType;
+class TimeArray;
+
+struct TimestampType;
+using TimestampArray = NumericArray<TimestampType>;
+
+struct IntervalType;
+using IntervalArray = NumericArray<IntervalType>;
+
+class TypeVisitor {
+ public:
+ virtual Status Visit(const NullType& type) = 0;
+ virtual Status Visit(const BooleanType& type) = 0;
+ virtual Status Visit(const Int8Type& type) = 0;
+ virtual Status Visit(const Int16Type& type) = 0;
+ virtual Status Visit(const Int32Type& type) = 0;
+ virtual Status Visit(const Int64Type& type) = 0;
+ virtual Status Visit(const UInt8Type& type) = 0;
+ virtual Status Visit(const UInt16Type& type) = 0;
+ virtual Status Visit(const UInt32Type& type) = 0;
+ virtual Status Visit(const UInt64Type& type) = 0;
+ virtual Status Visit(const HalfFloatType& type) = 0;
+ virtual Status Visit(const FloatType& type) = 0;
+ virtual Status Visit(const DoubleType& type) = 0;
+ virtual Status Visit(const StringType& type) = 0;
+ virtual Status Visit(const BinaryType& type) = 0;
+ virtual Status Visit(const DateType& type) = 0;
+ virtual Status Visit(const TimeType& type) = 0;
+ virtual Status Visit(const TimestampType& type) = 0;
+ virtual Status Visit(const IntervalType& type) = 0;
+ virtual Status Visit(const DecimalType& type) = 0;
+ virtual Status Visit(const ListType& type) = 0;
+ virtual Status Visit(const StructType& type) = 0;
+ virtual Status Visit(const UnionType& type) = 0;
+};
+
+class ArrayVisitor {
+ public:
+ virtual Status Visit(const NullArray& array) = 0;
+ virtual Status Visit(const BooleanArray& array) = 0;
+ virtual Status Visit(const Int8Array& array) = 0;
+ virtual Status Visit(const Int16Array& array) = 0;
+ virtual Status Visit(const Int32Array& array) = 0;
+ virtual Status Visit(const Int64Array& array) = 0;
+ virtual Status Visit(const UInt8Array& array) = 0;
+ virtual Status Visit(const UInt16Array& array) = 0;
+ virtual Status Visit(const UInt32Array& array) = 0;
+ virtual Status Visit(const UInt64Array& array) = 0;
+ virtual Status Visit(const HalfFloatArray& array) = 0;
+ virtual Status Visit(const FloatArray& array) = 0;
+ virtual Status Visit(const DoubleArray& array) = 0;
+ virtual Status Visit(const StringArray& array) = 0;
+ virtual Status Visit(const BinaryArray& array) = 0;
+ virtual Status Visit(const DateArray& array) = 0;
+ virtual Status Visit(const TimeArray& array) = 0;
+ virtual Status Visit(const TimestampArray& array) = 0;
+ virtual Status Visit(const IntervalArray& array) = 0;
+ virtual Status Visit(const DecimalArray& array) = 0;
+ virtual Status Visit(const ListArray& array) = 0;
+ virtual Status Visit(const StructArray& array) = 0;
+ virtual Status Visit(const UnionArray& array) = 0;
+};
+
+} // namespace arrow
+
+#endif // ARROW_TYPE_FWD_H
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/type_traits.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h
new file mode 100644
index 0000000..bbb8074
--- /dev/null
+++ b/cpp/src/arrow/type_traits.h
@@ -0,0 +1,197 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef ARROW_TYPE_TRAITS_H
+#define ARROW_TYPE_TRAITS_H
+
+#include <type_traits>
+
+#include "arrow/type_fwd.h"
+#include "arrow/util/bit-util.h"
+
+namespace arrow {
+
+template <typename T>
+struct TypeTraits {};
+
+template <>
+struct TypeTraits<UInt8Type> {
+ using ArrayType = UInt8Array;
+ using BuilderType = UInt8Builder;
+ static inline int bytes_required(int elements) { return elements; }
+};
+
+template <>
+struct TypeTraits<Int8Type> {
+ using ArrayType = Int8Array;
+ using BuilderType = Int8Builder;
+ static inline int bytes_required(int elements) { return elements; }
+};
+
+template <>
+struct TypeTraits<UInt16Type> {
+ using ArrayType = UInt16Array;
+ using BuilderType = UInt16Builder;
+
+ static inline int bytes_required(int elements) { return elements * sizeof(uint16_t); }
+};
+
+template <>
+struct TypeTraits<Int16Type> {
+ using ArrayType = Int16Array;
+ using BuilderType = Int16Builder;
+
+ static inline int bytes_required(int elements) { return elements * sizeof(int16_t); }
+};
+
+template <>
+struct TypeTraits<UInt32Type> {
+ using ArrayType = UInt32Array;
+ using BuilderType = UInt32Builder;
+
+ static inline int bytes_required(int elements) { return elements * sizeof(uint32_t); }
+};
+
+template <>
+struct TypeTraits<Int32Type> {
+ using ArrayType = Int32Array;
+ using BuilderType = Int32Builder;
+
+ static inline int bytes_required(int elements) { return elements * sizeof(int32_t); }
+};
+
+template <>
+struct TypeTraits<UInt64Type> {
+ using ArrayType = UInt64Array;
+ using BuilderType = UInt64Builder;
+
+ static inline int bytes_required(int elements) { return elements * sizeof(uint64_t); }
+};
+
+template <>
+struct TypeTraits<Int64Type> {
+ using ArrayType = Int64Array;
+ using BuilderType = Int64Builder;
+
+ static inline int bytes_required(int elements) { return elements * sizeof(int64_t); }
+};
+
+template <>
+struct TypeTraits<TimestampType> {
+ using ArrayType = TimestampArray;
+ // using BuilderType = TimestampBuilder;
+
+ static inline int bytes_required(int elements) { return elements * sizeof(int64_t); }
+};
+
+template <>
+struct TypeTraits<HalfFloatType> {
+ using ArrayType = HalfFloatArray;
+ using BuilderType = HalfFloatBuilder;
+
+ static inline int bytes_required(int elements) { return elements * sizeof(uint16_t); }
+};
+
+template <>
+struct TypeTraits<FloatType> {
+ using ArrayType = FloatArray;
+ using BuilderType = FloatBuilder;
+
+ static inline int bytes_required(int elements) { return elements * sizeof(float); }
+};
+
+template <>
+struct TypeTraits<DoubleType> {
+ using ArrayType = DoubleArray;
+ using BuilderType = DoubleBuilder;
+
+ static inline int bytes_required(int elements) { return elements * sizeof(double); }
+};
+
+template <>
+struct TypeTraits<BooleanType> {
+ using ArrayType = BooleanArray;
+ using BuilderType = BooleanBuilder;
+
+ static inline int bytes_required(int elements) {
+ return BitUtil::BytesForBits(elements);
+ }
+};
+
+template <>
+struct TypeTraits<StringType> {
+ using ArrayType = StringArray;
+ using BuilderType = StringBuilder;
+};
+
+template <>
+struct TypeTraits<BinaryType> {
+ using ArrayType = BinaryArray;
+ using BuilderType = BinaryBuilder;
+};
+
+// Not all type classes have a c_type
+template <typename T>
+struct as_void {
+ using type = void;
+};
+
+// The partial specialization will match if T has the ATTR_NAME member
+#define GET_ATTR(ATTR_NAME, DEFAULT) \
+ template <typename T, typename Enable = void> \
+ struct GetAttr_##ATTR_NAME { \
+ using type = DEFAULT; \
+ }; \
+ \
+ template <typename T> \
+ struct GetAttr_##ATTR_NAME<T, typename as_void<typename T::ATTR_NAME>::type> { \
+ using type = typename T::ATTR_NAME; \
+ };
+
+GET_ATTR(c_type, void);
+GET_ATTR(TypeClass, void);
+
+#undef GET_ATTR
+
+#define PRIMITIVE_TRAITS(T) \
+ using TypeClass = typename std::conditional<std::is_base_of<DataType, T>::value, T, \
+ typename GetAttr_TypeClass<T>::type>::type; \
+ using c_type = typename GetAttr_c_type<TypeClass>::type;
+
+template <typename T>
+struct IsUnsignedInt {
+ PRIMITIVE_TRAITS(T);
+ static constexpr bool value =
+ std::is_integral<c_type>::value && std::is_unsigned<c_type>::value;
+};
+
+template <typename T>
+struct IsSignedInt {
+ PRIMITIVE_TRAITS(T);
+ static constexpr bool value =
+ std::is_integral<c_type>::value && std::is_signed<c_type>::value;
+};
+
+template <typename T>
+struct IsFloatingPoint {
+ PRIMITIVE_TRAITS(T);
+ static constexpr bool value = std::is_floating_point<c_type>::value;
+};
+
+} // namespace arrow
+
+#endif // ARROW_TYPE_TRAITS_H
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/types/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/CMakeLists.txt b/cpp/src/arrow/types/CMakeLists.txt
index 9f78169..6d59acf 100644
--- a/cpp/src/arrow/types/CMakeLists.txt
+++ b/cpp/src/arrow/types/CMakeLists.txt
@@ -21,7 +21,6 @@
# Headers: top level
install(FILES
- collection.h
construct.h
datetime.h
decimal.h
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/types/collection.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/collection.h b/cpp/src/arrow/types/collection.h
deleted file mode 100644
index 1712030..0000000
--- a/cpp/src/arrow/types/collection.h
+++ /dev/null
@@ -1,41 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#ifndef ARROW_TYPES_COLLECTION_H
-#define ARROW_TYPES_COLLECTION_H
-
-#include <string>
-#include <vector>
-
-#include "arrow/type.h"
-
-namespace arrow {
-
-template <Type::type T>
-struct CollectionType : public DataType {
- std::vector<TypePtr> child_types_;
-
- CollectionType() : DataType(T) {}
-
- const TypePtr& child(int i) const { return child_types_[i]; }
-
- int num_children() const { return child_types_.size(); }
-};
-
-} // namespace arrow
-
-#endif // ARROW_TYPES_COLLECTION_H
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/types/datetime.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/datetime.h b/cpp/src/arrow/types/datetime.h
index 241a126..a8f8639 100644
--- a/cpp/src/arrow/types/datetime.h
+++ b/cpp/src/arrow/types/datetime.h
@@ -22,41 +22,6 @@
#include "arrow/type.h"
-namespace arrow {
-
-struct DateType : public DataType {
- enum class Unit : char { DAY = 0, MONTH = 1, YEAR = 2 };
-
- Unit unit;
-
- explicit DateType(Unit unit = Unit::DAY) : DataType(Type::DATE), unit(unit) {}
-
- DateType(const DateType& other) : DateType(other.unit) {}
-
- static char const* name() { return "date"; }
-};
-
-struct ARROW_EXPORT TimestampType : public DataType {
- enum class Unit : char { SECOND = 0, MILLI = 1, MICRO = 2, NANO = 3 };
-
- typedef int64_t c_type;
- static constexpr Type::type type_enum = Type::TIMESTAMP;
-
- int value_size() const override { return sizeof(int64_t); }
-
- Unit unit;
-
- explicit TimestampType(Unit unit = Unit::MILLI)
- : DataType(Type::TIMESTAMP), unit(unit) {}
-
- TimestampType(const TimestampType& other) : TimestampType(other.unit) {}
- virtual ~TimestampType() {}
-
- std::string ToString() const override { return "timestamp"; }
-
- static char const* name() { return "timestamp"; }
-};
-
-} // namespace arrow
+namespace arrow {} // namespace arrow
#endif // ARROW_TYPES_DATETIME_H
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/types/decimal.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/decimal.h b/cpp/src/arrow/types/decimal.h
index 6c497c5..b3ea3a5 100644
--- a/cpp/src/arrow/types/decimal.h
+++ b/cpp/src/arrow/types/decimal.h
@@ -23,18 +23,6 @@
#include "arrow/type.h"
#include "arrow/util/visibility.h"
-namespace arrow {
-
-struct ARROW_EXPORT DecimalType : public DataType {
- explicit DecimalType(int precision_, int scale_)
- : DataType(Type::DECIMAL), precision(precision_), scale(scale_) {}
- int precision;
- int scale;
- static char const* name() { return "decimal"; }
-
- std::string ToString() const override;
-};
-
-} // namespace arrow
+namespace arrow {} // namespace arrow
#endif // ARROW_TYPES_DECIMAL_H
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/types/list-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/list-test.cc b/cpp/src/arrow/types/list-test.cc
index 12c5394..cb9a8c1 100644
--- a/cpp/src/arrow/types/list-test.cc
+++ b/cpp/src/arrow/types/list-test.cc
@@ -141,7 +141,7 @@ TEST_F(TestListBuilder, TestAppendNull) {
ASSERT_TRUE(result_->IsNull(0));
ASSERT_TRUE(result_->IsNull(1));
- ASSERT_EQ(0, result_->offsets()[0]);
+ ASSERT_EQ(0, result_->raw_offsets()[0]);
ASSERT_EQ(0, result_->offset(1));
ASSERT_EQ(0, result_->offset(2));
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/types/list.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/list.cc b/cpp/src/arrow/types/list.cc
index 4b1e821..d865632 100644
--- a/cpp/src/arrow/types/list.cc
+++ b/cpp/src/arrow/types/list.cc
@@ -155,4 +155,8 @@ void ListBuilder::Reset() {
null_bitmap_ = nullptr;
}
+Status ListArray::Accept(ArrayVisitor* visitor) const {
+ return visitor->Visit(*this);
+}
+
} // namespace arrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/types/list.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h
index 9440ffe..bd93e8f 100644
--- a/cpp/src/arrow/types/list.h
+++ b/cpp/src/arrow/types/list.h
@@ -39,6 +39,8 @@ class MemoryPool;
class ARROW_EXPORT ListArray : public Array {
public:
+ using TypeClass = ListType;
+
ListArray(const TypePtr& type, int32_t length, std::shared_ptr<Buffer> offsets,
const ArrayPtr& values, int32_t null_count = 0,
std::shared_ptr<Buffer> null_bitmap = nullptr)
@@ -56,13 +58,13 @@ class ARROW_EXPORT ListArray : public Array {
// Return a shared pointer in case the requestor desires to share ownership
// with this array.
const std::shared_ptr<Array>& values() const { return values_; }
- const std::shared_ptr<Buffer> offset_buffer() const {
+ std::shared_ptr<Buffer> offsets() const {
return std::static_pointer_cast<Buffer>(offset_buffer_);
}
const std::shared_ptr<DataType>& value_type() const { return values_->type(); }
- const int32_t* offsets() const { return offsets_; }
+ const int32_t* raw_offsets() const { return offsets_; }
int32_t offset(int i) const { return offsets_[i]; }
@@ -76,6 +78,8 @@ class ARROW_EXPORT ListArray : public Array {
bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx,
const ArrayPtr& arr) const override;
+ Status Accept(ArrayVisitor* visitor) const override;
+
protected:
std::shared_ptr<Buffer> offset_buffer_;
const int32_t* offsets_;
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/types/primitive-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc
index e47f6dc..bdc8ec0 100644
--- a/cpp/src/arrow/types/primitive-test.cc
+++ b/cpp/src/arrow/types/primitive-test.cc
@@ -25,6 +25,7 @@
#include "arrow/builder.h"
#include "arrow/test-util.h"
#include "arrow/type.h"
+#include "arrow/type_traits.h"
#include "arrow/types/construct.h"
#include "arrow/types/primitive.h"
#include "arrow/types/test-common.h"
@@ -41,15 +42,15 @@ namespace arrow {
class Array;
-#define PRIMITIVE_TEST(KLASS, ENUM, NAME) \
- TEST(TypesTest, TestPrimitive_##ENUM) { \
- KLASS tp; \
- \
- ASSERT_EQ(tp.type, Type::ENUM); \
- ASSERT_EQ(tp.name(), string(NAME)); \
- \
- KLASS tp_copy = tp; \
- ASSERT_EQ(tp_copy.type, Type::ENUM); \
+#define PRIMITIVE_TEST(KLASS, ENUM, NAME) \
+ TEST(TypesTest, TestPrimitive_##ENUM) { \
+ KLASS tp; \
+ \
+ ASSERT_EQ(tp.type, Type::ENUM); \
+ ASSERT_EQ(tp.ToString(), string(NAME)); \
+ \
+ KLASS tp_copy = tp; \
+ ASSERT_EQ(tp_copy.type, Type::ENUM); \
}
PRIMITIVE_TEST(Int8Type, INT8, "int8");
@@ -243,7 +244,8 @@ void TestPrimitiveBuilder<PBoolean>::Check(
}
typedef ::testing::Types<PBoolean, PUInt8, PUInt16, PUInt32, PUInt64, PInt8, PInt16,
- PInt32, PInt64, PFloat, PDouble> Primitives;
+ PInt32, PInt64, PFloat, PDouble>
+ Primitives;
TYPED_TEST_CASE(TestPrimitiveBuilder, Primitives);
@@ -311,20 +313,6 @@ TYPED_TEST(TestPrimitiveBuilder, TestArrayDtorDealloc) {
ASSERT_EQ(memory_before, this->pool_->bytes_allocated());
}
-template <class T, class Builder>
-Status MakeArray(const vector<uint8_t>& valid_bytes, const vector<T>& draws, int size,
- Builder* builder, ArrayPtr* out) {
- // Append the first 1000
- for (int i = 0; i < size; ++i) {
- if (valid_bytes[i] > 0) {
- RETURN_NOT_OK(builder->Append(draws[i]));
- } else {
- RETURN_NOT_OK(builder->AppendNull());
- }
- }
- return builder->Finish(out);
-}
-
TYPED_TEST(TestPrimitiveBuilder, Equality) {
DECL_T();
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/types/primitive.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc
index d2288ba..14667ee 100644
--- a/cpp/src/arrow/types/primitive.cc
+++ b/cpp/src/arrow/types/primitive.cc
@@ -19,6 +19,7 @@
#include <memory>
+#include "arrow/type_traits.h"
#include "arrow/util/bit-util.h"
#include "arrow/util/buffer.h"
#include "arrow/util/logging.h"
@@ -48,13 +49,14 @@ bool PrimitiveArray::EqualsExact(const PrimitiveArray& other) const {
const uint8_t* this_data = raw_data_;
const uint8_t* other_data = other.raw_data_;
- int value_size = type_->value_size();
- DCHECK_GT(value_size, 0);
+ auto size_meta = dynamic_cast<const FixedWidthMeta*>(type_.get());
+ int value_byte_size = size_meta->bit_width() / 8;
+ DCHECK_GT(value_byte_size, 0);
for (int i = 0; i < length_; ++i) {
- if (!IsNull(i) && memcmp(this_data, other_data, value_size)) { return false; }
- this_data += value_size;
- other_data += value_size;
+ if (!IsNull(i) && memcmp(this_data, other_data, value_byte_size)) { return false; }
+ this_data += value_byte_size;
+ other_data += value_byte_size;
}
return true;
} else {
@@ -70,6 +72,11 @@ bool PrimitiveArray::Equals(const std::shared_ptr<Array>& arr) const {
return EqualsExact(*static_cast<const PrimitiveArray*>(arr.get()));
}
+template <typename T>
+Status NumericArray<T>::Accept(ArrayVisitor* visitor) const {
+ return visitor->Visit(*this);
+}
+
template class NumericArray<UInt8Type>;
template class NumericArray<UInt16Type>;
template class NumericArray<UInt32Type>;
@@ -79,9 +86,9 @@ template class NumericArray<Int16Type>;
template class NumericArray<Int32Type>;
template class NumericArray<Int64Type>;
template class NumericArray<TimestampType>;
+template class NumericArray<HalfFloatType>;
template class NumericArray<FloatType>;
template class NumericArray<DoubleType>;
-template class NumericArray<BooleanType>;
template <typename T>
Status PrimitiveBuilder<T>::Init(int32_t capacity) {
@@ -145,8 +152,65 @@ Status PrimitiveBuilder<T>::Finish(std::shared_ptr<Array>* out) {
return Status::OK();
}
-template <>
-Status PrimitiveBuilder<BooleanType>::Append(
+template class PrimitiveBuilder<UInt8Type>;
+template class PrimitiveBuilder<UInt16Type>;
+template class PrimitiveBuilder<UInt32Type>;
+template class PrimitiveBuilder<UInt64Type>;
+template class PrimitiveBuilder<Int8Type>;
+template class PrimitiveBuilder<Int16Type>;
+template class PrimitiveBuilder<Int32Type>;
+template class PrimitiveBuilder<Int64Type>;
+template class PrimitiveBuilder<TimestampType>;
+template class PrimitiveBuilder<HalfFloatType>;
+template class PrimitiveBuilder<FloatType>;
+template class PrimitiveBuilder<DoubleType>;
+
+Status BooleanBuilder::Init(int32_t capacity) {
+ RETURN_NOT_OK(ArrayBuilder::Init(capacity));
+ data_ = std::make_shared<PoolBuffer>(pool_);
+
+ int64_t nbytes = BitUtil::BytesForBits(capacity);
+ RETURN_NOT_OK(data_->Resize(nbytes));
+ // TODO(emkornfield) valgrind complains without this
+ memset(data_->mutable_data(), 0, nbytes);
+
+ raw_data_ = reinterpret_cast<uint8_t*>(data_->mutable_data());
+ return Status::OK();
+}
+
+Status BooleanBuilder::Resize(int32_t capacity) {
+ // XXX: Set floor size for now
+ if (capacity < kMinBuilderCapacity) { capacity = kMinBuilderCapacity; }
+
+ if (capacity_ == 0) {
+ RETURN_NOT_OK(Init(capacity));
+ } else {
+ RETURN_NOT_OK(ArrayBuilder::Resize(capacity));
+ const int64_t old_bytes = data_->size();
+ const int64_t new_bytes = BitUtil::BytesForBits(capacity);
+
+ RETURN_NOT_OK(data_->Resize(new_bytes));
+ raw_data_ = reinterpret_cast<uint8_t*>(data_->mutable_data());
+ memset(data_->mutable_data() + old_bytes, 0, new_bytes - old_bytes);
+ }
+ return Status::OK();
+}
+
+Status BooleanBuilder::Finish(std::shared_ptr<Array>* out) {
+ const int64_t bytes_required = BitUtil::BytesForBits(length_);
+
+ if (bytes_required > 0 && bytes_required < data_->size()) {
+ // Trim buffers
+ RETURN_NOT_OK(data_->Resize(bytes_required));
+ }
+ *out = std::make_shared<BooleanArray>(type_, length_, data_, null_count_, null_bitmap_);
+
+ data_ = null_bitmap_ = nullptr;
+ capacity_ = length_ = null_count_ = 0;
+ return Status::OK();
+}
+
+Status BooleanBuilder::Append(
const uint8_t* values, int32_t length, const uint8_t* valid_bytes) {
RETURN_NOT_OK(Reserve(length));
@@ -168,19 +232,6 @@ Status PrimitiveBuilder<BooleanType>::Append(
return Status::OK();
}
-template class PrimitiveBuilder<UInt8Type>;
-template class PrimitiveBuilder<UInt16Type>;
-template class PrimitiveBuilder<UInt32Type>;
-template class PrimitiveBuilder<UInt64Type>;
-template class PrimitiveBuilder<Int8Type>;
-template class PrimitiveBuilder<Int16Type>;
-template class PrimitiveBuilder<Int32Type>;
-template class PrimitiveBuilder<Int64Type>;
-template class PrimitiveBuilder<TimestampType>;
-template class PrimitiveBuilder<FloatType>;
-template class PrimitiveBuilder<DoubleType>;
-template class PrimitiveBuilder<BooleanType>;
-
BooleanArray::BooleanArray(int32_t length, const std::shared_ptr<Buffer>& data,
int32_t null_count, const std::shared_ptr<Buffer>& null_bitmap)
: PrimitiveArray(
@@ -235,4 +286,8 @@ bool BooleanArray::RangeEquals(int32_t start_idx, int32_t end_idx,
return true;
}
+Status BooleanArray::Accept(ArrayVisitor* visitor) const {
+ return visitor->Visit(*this);
+}
+
} // namespace arrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/types/primitive.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h
index c71df58..a5a3704 100644
--- a/cpp/src/arrow/types/primitive.h
+++ b/cpp/src/arrow/types/primitive.h
@@ -26,6 +26,7 @@
#include "arrow/array.h"
#include "arrow/builder.h"
#include "arrow/type.h"
+#include "arrow/type_fwd.h"
#include "arrow/types/datetime.h"
#include "arrow/util/bit-util.h"
#include "arrow/util/buffer.h"
@@ -54,9 +55,10 @@ class ARROW_EXPORT PrimitiveArray : public Array {
const uint8_t* raw_data_;
};
-template <class TypeClass>
+template <class TYPE>
class ARROW_EXPORT NumericArray : public PrimitiveArray {
public:
+ using TypeClass = TYPE;
using value_type = typename TypeClass::c_type;
NumericArray(int32_t length, const std::shared_ptr<Buffer>& data,
int32_t null_count = 0, const std::shared_ptr<Buffer>& null_bitmap = nullptr)
@@ -88,29 +90,15 @@ class ARROW_EXPORT NumericArray : public PrimitiveArray {
return reinterpret_cast<const value_type*>(raw_data_);
}
+ Status Accept(ArrayVisitor* visitor) const override;
+
value_type Value(int i) const { return raw_data()[i]; }
};
-#define NUMERIC_ARRAY_DECL(NAME, TypeClass) \
- using NAME = NumericArray<TypeClass>; \
- extern template class ARROW_EXPORT NumericArray<TypeClass>;
-
-NUMERIC_ARRAY_DECL(UInt8Array, UInt8Type);
-NUMERIC_ARRAY_DECL(Int8Array, Int8Type);
-NUMERIC_ARRAY_DECL(UInt16Array, UInt16Type);
-NUMERIC_ARRAY_DECL(Int16Array, Int16Type);
-NUMERIC_ARRAY_DECL(UInt32Array, UInt32Type);
-NUMERIC_ARRAY_DECL(Int32Array, Int32Type);
-NUMERIC_ARRAY_DECL(UInt64Array, UInt64Type);
-NUMERIC_ARRAY_DECL(Int64Array, Int64Type);
-NUMERIC_ARRAY_DECL(TimestampArray, TimestampType);
-NUMERIC_ARRAY_DECL(FloatArray, FloatType);
-NUMERIC_ARRAY_DECL(DoubleArray, DoubleType);
-
template <typename Type>
class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder {
public:
- typedef typename Type::c_type value_type;
+ using value_type = typename Type::c_type;
explicit PrimitiveBuilder(MemoryPool* pool, const TypePtr& type)
: ArrayBuilder(pool, type), data_(nullptr) {}
@@ -183,101 +171,27 @@ class ARROW_EXPORT NumericBuilder : public PrimitiveBuilder<T> {
using PrimitiveBuilder<T>::raw_data_;
};
-template <>
-struct TypeTraits<UInt8Type> {
- typedef UInt8Array ArrayType;
-
- static inline int bytes_required(int elements) { return elements; }
-};
-
-template <>
-struct TypeTraits<Int8Type> {
- typedef Int8Array ArrayType;
-
- static inline int bytes_required(int elements) { return elements; }
-};
-
-template <>
-struct TypeTraits<UInt16Type> {
- typedef UInt16Array ArrayType;
-
- static inline int bytes_required(int elements) { return elements * sizeof(uint16_t); }
-};
-
-template <>
-struct TypeTraits<Int16Type> {
- typedef Int16Array ArrayType;
-
- static inline int bytes_required(int elements) { return elements * sizeof(int16_t); }
-};
-
-template <>
-struct TypeTraits<UInt32Type> {
- typedef UInt32Array ArrayType;
-
- static inline int bytes_required(int elements) { return elements * sizeof(uint32_t); }
-};
-
-template <>
-struct TypeTraits<Int32Type> {
- typedef Int32Array ArrayType;
-
- static inline int bytes_required(int elements) { return elements * sizeof(int32_t); }
-};
-
-template <>
-struct TypeTraits<UInt64Type> {
- typedef UInt64Array ArrayType;
-
- static inline int bytes_required(int elements) { return elements * sizeof(uint64_t); }
-};
-
-template <>
-struct TypeTraits<Int64Type> {
- typedef Int64Array ArrayType;
-
- static inline int bytes_required(int elements) { return elements * sizeof(int64_t); }
-};
-
-template <>
-struct TypeTraits<TimestampType> {
- typedef TimestampArray ArrayType;
-
- static inline int bytes_required(int elements) { return elements * sizeof(int64_t); }
-};
-template <>
-
-struct TypeTraits<FloatType> {
- typedef FloatArray ArrayType;
-
- static inline int bytes_required(int elements) { return elements * sizeof(float); }
-};
-
-template <>
-struct TypeTraits<DoubleType> {
- typedef DoubleArray ArrayType;
-
- static inline int bytes_required(int elements) { return elements * sizeof(double); }
-};
-
// Builders
-typedef NumericBuilder<UInt8Type> UInt8Builder;
-typedef NumericBuilder<UInt16Type> UInt16Builder;
-typedef NumericBuilder<UInt32Type> UInt32Builder;
-typedef NumericBuilder<UInt64Type> UInt64Builder;
+using UInt8Builder = NumericBuilder<UInt8Type>;
+using UInt16Builder = NumericBuilder<UInt16Type>;
+using UInt32Builder = NumericBuilder<UInt32Type>;
+using UInt64Builder = NumericBuilder<UInt64Type>;
-typedef NumericBuilder<Int8Type> Int8Builder;
-typedef NumericBuilder<Int16Type> Int16Builder;
-typedef NumericBuilder<Int32Type> Int32Builder;
-typedef NumericBuilder<Int64Type> Int64Builder;
-typedef NumericBuilder<TimestampType> TimestampBuilder;
+using Int8Builder = NumericBuilder<Int8Type>;
+using Int16Builder = NumericBuilder<Int16Type>;
+using Int32Builder = NumericBuilder<Int32Type>;
+using Int64Builder = NumericBuilder<Int64Type>;
+using TimestampBuilder = NumericBuilder<TimestampType>;
-typedef NumericBuilder<FloatType> FloatBuilder;
-typedef NumericBuilder<DoubleType> DoubleBuilder;
+using HalfFloatBuilder = NumericBuilder<HalfFloatType>;
+using FloatBuilder = NumericBuilder<FloatType>;
+using DoubleBuilder = NumericBuilder<DoubleType>;
class ARROW_EXPORT BooleanArray : public PrimitiveArray {
public:
+ using TypeClass = BooleanType;
+
BooleanArray(int32_t length, const std::shared_ptr<Buffer>& data,
int32_t null_count = 0, const std::shared_ptr<Buffer>& null_bitmap = nullptr);
BooleanArray(const TypePtr& type, int32_t length, const std::shared_ptr<Buffer>& data,
@@ -288,28 +202,36 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray {
bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx,
const ArrayPtr& arr) const override;
+ Status Accept(ArrayVisitor* visitor) const override;
+
const uint8_t* raw_data() const { return reinterpret_cast<const uint8_t*>(raw_data_); }
bool Value(int i) const { return BitUtil::GetBit(raw_data(), i); }
};
-template <>
-struct TypeTraits<BooleanType> {
- typedef BooleanArray ArrayType;
-
- static inline int bytes_required(int elements) {
- return BitUtil::BytesForBits(elements);
- }
-};
-
-class ARROW_EXPORT BooleanBuilder : public PrimitiveBuilder<BooleanType> {
+class ARROW_EXPORT BooleanBuilder : public ArrayBuilder {
public:
explicit BooleanBuilder(MemoryPool* pool, const TypePtr& type)
- : PrimitiveBuilder<BooleanType>(pool, type) {}
+ : ArrayBuilder(pool, type), data_(nullptr) {}
virtual ~BooleanBuilder() {}
- using PrimitiveBuilder<BooleanType>::Append;
+ using ArrayBuilder::Advance;
+
+ // Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory
+ Status AppendNulls(const uint8_t* valid_bytes, int32_t length) {
+ RETURN_NOT_OK(Reserve(length));
+ UnsafeAppendToBitmap(valid_bytes, length);
+ return Status::OK();
+ }
+
+ Status AppendNull() {
+ RETURN_NOT_OK(Reserve(1));
+ UnsafeAppendToBitmap(false);
+ return Status::OK();
+ }
+
+ std::shared_ptr<Buffer> data() const { return data_; }
// Scalar append
Status Append(bool val) {
@@ -324,9 +246,39 @@ class ARROW_EXPORT BooleanBuilder : public PrimitiveBuilder<BooleanType> {
return Status::OK();
}
- Status Append(uint8_t val) { return Append(static_cast<bool>(val)); }
+ // Vector append
+ //
+ // If passed, valid_bytes is of equal length to values, and any zero byte
+ // will be considered as a null for that slot
+ Status Append(
+ const uint8_t* values, int32_t length, const uint8_t* valid_bytes = nullptr);
+
+ Status Finish(std::shared_ptr<Array>* out) override;
+ Status Init(int32_t capacity) override;
+
+ // Increase the capacity of the builder to accommodate at least the indicated
+ // number of elements
+ Status Resize(int32_t capacity) override;
+
+ protected:
+ std::shared_ptr<PoolBuffer> data_;
+ uint8_t* raw_data_;
};
+// Only instantiate these templates once
+extern template class ARROW_EXPORT NumericArray<Int8Type>;
+extern template class ARROW_EXPORT NumericArray<UInt8Type>;
+extern template class ARROW_EXPORT NumericArray<Int16Type>;
+extern template class ARROW_EXPORT NumericArray<UInt16Type>;
+extern template class ARROW_EXPORT NumericArray<Int32Type>;
+extern template class ARROW_EXPORT NumericArray<UInt32Type>;
+extern template class ARROW_EXPORT NumericArray<Int64Type>;
+extern template class ARROW_EXPORT NumericArray<UInt64Type>;
+extern template class ARROW_EXPORT NumericArray<HalfFloatType>;
+extern template class ARROW_EXPORT NumericArray<FloatType>;
+extern template class ARROW_EXPORT NumericArray<DoubleType>;
+extern template class ARROW_EXPORT NumericArray<TimestampType>;
+
} // namespace arrow
#endif // ARROW_TYPES_PRIMITIVE_H
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/types/string-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/string-test.cc b/cpp/src/arrow/types/string-test.cc
index af87a14..3c4b12b 100644
--- a/cpp/src/arrow/types/string-test.cc
+++ b/cpp/src/arrow/types/string-test.cc
@@ -47,7 +47,7 @@ TEST(TypesTest, BinaryType) {
TEST(TypesTest, TestStringType) {
StringType str;
ASSERT_EQ(str.type, Type::STRING);
- ASSERT_EQ(str.name(), std::string("string"));
+ ASSERT_EQ(str.ToString(), std::string("string"));
}
// ----------------------------------------------------------------------
@@ -66,8 +66,8 @@ class TestStringContainer : public ::testing::Test {
void MakeArray() {
length_ = offsets_.size() - 1;
- value_buf_ = test::to_buffer(chars_);
- offsets_buf_ = test::to_buffer(offsets_);
+ value_buf_ = test::GetBufferFromVector(chars_);
+ offsets_buf_ = test::GetBufferFromVector(offsets_);
null_bitmap_ = test::bytes_to_null_buffer(valid_bytes_);
null_count_ = test::null_count(valid_bytes_);
@@ -131,7 +131,7 @@ TEST_F(TestStringContainer, TestGetString) {
TEST_F(TestStringContainer, TestEmptyStringComparison) {
offsets_ = {0, 0, 0, 0, 0, 0};
- offsets_buf_ = test::to_buffer(offsets_);
+ offsets_buf_ = test::GetBufferFromVector(offsets_);
length_ = offsets_.size() - 1;
auto strings_a = std::make_shared<StringArray>(
@@ -227,8 +227,8 @@ class TestBinaryContainer : public ::testing::Test {
void MakeArray() {
length_ = offsets_.size() - 1;
- value_buf_ = test::to_buffer(chars_);
- offsets_buf_ = test::to_buffer(offsets_);
+ value_buf_ = test::GetBufferFromVector(chars_);
+ offsets_buf_ = test::GetBufferFromVector(offsets_);
null_bitmap_ = test::bytes_to_null_buffer(valid_bytes_);
null_count_ = test::null_count(valid_bytes_);
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/types/string.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/string.cc b/cpp/src/arrow/types/string.cc
index f6d26df..db963df 100644
--- a/cpp/src/arrow/types/string.cc
+++ b/cpp/src/arrow/types/string.cc
@@ -94,6 +94,10 @@ bool BinaryArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_
return true;
}
+Status BinaryArray::Accept(ArrayVisitor* visitor) const {
+ return visitor->Visit(*this);
+}
+
StringArray::StringArray(int32_t length, const std::shared_ptr<Buffer>& offsets,
const std::shared_ptr<Buffer>& data, int32_t null_count,
const std::shared_ptr<Buffer>& null_bitmap)
@@ -104,6 +108,10 @@ Status StringArray::Validate() const {
return BinaryArray::Validate();
}
+Status StringArray::Accept(ArrayVisitor* visitor) const {
+ return visitor->Visit(*this);
+}
+
// This used to be a static member variable of BinaryBuilder, but it can cause
// valgrind to report a (spurious?) memory leak when needed in other shared
// libraries. The problem came up while adding explicit visibility to libarrow
@@ -122,8 +130,8 @@ Status BinaryBuilder::Finish(std::shared_ptr<Array>* out) {
const auto list = std::dynamic_pointer_cast<ListArray>(result);
auto values = std::dynamic_pointer_cast<UInt8Array>(list->values());
- *out = std::make_shared<BinaryArray>(list->length(), list->offset_buffer(),
- values->data(), list->null_count(), list->null_bitmap());
+ *out = std::make_shared<BinaryArray>(list->length(), list->offsets(), values->data(),
+ list->null_count(), list->null_bitmap());
return Status::OK();
}
@@ -134,8 +142,8 @@ Status StringBuilder::Finish(std::shared_ptr<Array>* out) {
const auto list = std::dynamic_pointer_cast<ListArray>(result);
auto values = std::dynamic_pointer_cast<UInt8Array>(list->values());
- *out = std::make_shared<StringArray>(list->length(), list->offset_buffer(),
- values->data(), list->null_count(), list->null_bitmap());
+ *out = std::make_shared<StringArray>(list->length(), list->offsets(), values->data(),
+ list->null_count(), list->null_bitmap());
return Status::OK();
}
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/types/string.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h
index aaba49c..c875243 100644
--- a/cpp/src/arrow/types/string.h
+++ b/cpp/src/arrow/types/string.h
@@ -37,6 +37,8 @@ class MemoryPool;
class ARROW_EXPORT BinaryArray : public Array {
public:
+ using TypeClass = BinaryType;
+
BinaryArray(int32_t length, const std::shared_ptr<Buffer>& offsets,
const std::shared_ptr<Buffer>& data, int32_t null_count = 0,
const std::shared_ptr<Buffer>& null_bitmap = nullptr);
@@ -60,6 +62,8 @@ class ARROW_EXPORT BinaryArray : public Array {
std::shared_ptr<Buffer> data() const { return data_buffer_; }
std::shared_ptr<Buffer> offsets() const { return offset_buffer_; }
+ const int32_t* raw_offsets() const { return offsets_; }
+
int32_t offset(int i) const { return offsets_[i]; }
// Neither of these functions will perform boundschecking
@@ -73,6 +77,8 @@ class ARROW_EXPORT BinaryArray : public Array {
Status Validate() const override;
+ Status Accept(ArrayVisitor* visitor) const override;
+
private:
std::shared_ptr<Buffer> offset_buffer_;
const int32_t* offsets_;
@@ -83,6 +89,8 @@ class ARROW_EXPORT BinaryArray : public Array {
class ARROW_EXPORT StringArray : public BinaryArray {
public:
+ using TypeClass = StringType;
+
StringArray(int32_t length, const std::shared_ptr<Buffer>& offsets,
const std::shared_ptr<Buffer>& data, int32_t null_count = 0,
const std::shared_ptr<Buffer>& null_bitmap = nullptr);
@@ -96,6 +104,8 @@ class ARROW_EXPORT StringArray : public BinaryArray {
}
Status Validate() const override;
+
+ Status Accept(ArrayVisitor* visitor) const override;
};
// BinaryBuilder : public ListBuilder
@@ -109,6 +119,12 @@ class ARROW_EXPORT BinaryBuilder : public ListBuilder {
return byte_builder_->Append(value, length);
}
+ Status Append(const char* value, int32_t length) {
+ return Append(reinterpret_cast<const uint8_t*>(value), length);
+ }
+
+ Status Append(const std::string& value) { return Append(value.c_str(), value.size()); }
+
Status Finish(std::shared_ptr<Array>* out) override;
protected:
@@ -121,13 +137,9 @@ class ARROW_EXPORT StringBuilder : public BinaryBuilder {
explicit StringBuilder(MemoryPool* pool, const TypePtr& type)
: BinaryBuilder(pool, type) {}
- Status Finish(std::shared_ptr<Array>* out) override;
-
- Status Append(const std::string& value) { return Append(value.c_str(), value.size()); }
+ using BinaryBuilder::Append;
- Status Append(const char* value, int32_t length) {
- return BinaryBuilder::Append(reinterpret_cast<const uint8_t*>(value), length);
- }
+ Status Finish(std::shared_ptr<Array>* out) override;
Status Append(const std::vector<std::string>& values, uint8_t* null_bytes);
};
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/types/struct-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/struct-test.cc b/cpp/src/arrow/types/struct-test.cc
index 8e82c38..197d7d4 100644
--- a/cpp/src/arrow/types/struct-test.cc
+++ b/cpp/src/arrow/types/struct-test.cc
@@ -80,7 +80,7 @@ void ValidateBasicStructArray(const StructArray* result,
ASSERT_EQ(4, list_char_arr->length());
ASSERT_EQ(10, list_char_arr->values()->length());
for (size_t i = 0; i < list_offsets.size(); ++i) {
- ASSERT_EQ(list_offsets[i], list_char_arr->offsets()[i]);
+ ASSERT_EQ(list_offsets[i], list_char_arr->raw_offsets()[i]);
}
for (size_t i = 0; i < list_values.size(); ++i) {
ASSERT_EQ(list_values[i], char_arr->Value(i));
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/types/struct.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/struct.cc b/cpp/src/arrow/types/struct.cc
index 369c29d..0e0db23 100644
--- a/cpp/src/arrow/types/struct.cc
+++ b/cpp/src/arrow/types/struct.cc
@@ -87,6 +87,10 @@ Status StructArray::Validate() const {
return Status::OK();
}
+Status StructArray::Accept(ArrayVisitor* visitor) const {
+ return visitor->Visit(*this);
+}
+
Status StructBuilder::Finish(std::shared_ptr<Array>* out) {
std::vector<std::shared_ptr<Array>> fields(field_builders_.size());
for (size_t i = 0; i < field_builders_.size(); ++i) {
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/types/struct.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/struct.h b/cpp/src/arrow/types/struct.h
index 65b8daf..035af05 100644
--- a/cpp/src/arrow/types/struct.h
+++ b/cpp/src/arrow/types/struct.h
@@ -31,6 +31,8 @@ namespace arrow {
class ARROW_EXPORT StructArray : public Array {
public:
+ using TypeClass = StructType;
+
StructArray(const TypePtr& type, int32_t length, std::vector<ArrayPtr>& field_arrays,
int32_t null_count = 0, std::shared_ptr<Buffer> null_bitmap = nullptr)
: Array(type, length, null_count, null_bitmap) {
@@ -55,6 +57,8 @@ class ARROW_EXPORT StructArray : public Array {
bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx,
const std::shared_ptr<Array>& arr) const override;
+ Status Accept(ArrayVisitor* visitor) const override;
+
protected:
// The child arrays corresponding to each field of the struct data type.
std::vector<ArrayPtr> field_arrays_;
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/types/test-common.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/test-common.h b/cpp/src/arrow/types/test-common.h
index 1957636..6e6ab85 100644
--- a/cpp/src/arrow/types/test-common.h
+++ b/cpp/src/arrow/types/test-common.h
@@ -24,6 +24,8 @@
#include "gtest/gtest.h"
+#include "arrow/array.h"
+#include "arrow/builder.h"
#include "arrow/test-util.h"
#include "arrow/type.h"
#include "arrow/util/memory-pool.h"
@@ -49,6 +51,20 @@ class TestBuilder : public ::testing::Test {
unique_ptr<ArrayBuilder> builder_nn_;
};
+template <class T, class Builder>
+Status MakeArray(const std::vector<uint8_t>& valid_bytes, const std::vector<T>& values,
+ int size, Builder* builder, ArrayPtr* out) {
+ // Append the first 1000
+ for (int i = 0; i < size; ++i) {
+ if (valid_bytes[i] > 0) {
+ RETURN_NOT_OK(builder->Append(values[i]));
+ } else {
+ RETURN_NOT_OK(builder->AppendNull());
+ }
+ }
+ return builder->Finish(out);
+}
+
} // namespace arrow
#endif // ARROW_TYPES_TEST_COMMON_H
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/types/union.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/union.cc b/cpp/src/arrow/types/union.cc
index c891b4a..cc2934b 100644
--- a/cpp/src/arrow/types/union.cc
+++ b/cpp/src/arrow/types/union.cc
@@ -24,25 +24,4 @@
#include "arrow/type.h"
-namespace arrow {
-
-static inline std::string format_union(const std::vector<TypePtr>& child_types) {
- std::stringstream s;
- s << "union<";
- for (size_t i = 0; i < child_types.size(); ++i) {
- if (i) { s << ", "; }
- s << child_types[i]->ToString();
- }
- s << ">";
- return s.str();
-}
-
-std::string DenseUnionType::ToString() const {
- return format_union(child_types_);
-}
-
-std::string SparseUnionType::ToString() const {
- return format_union(child_types_);
-}
-
-} // namespace arrow
+namespace arrow {} // namespace arrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/types/union.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/types/union.h b/cpp/src/arrow/types/union.h
index d2ee9bd..44f39cc 100644
--- a/cpp/src/arrow/types/union.h
+++ b/cpp/src/arrow/types/union.h
@@ -24,32 +24,11 @@
#include "arrow/array.h"
#include "arrow/type.h"
-#include "arrow/types/collection.h"
namespace arrow {
class Buffer;
-struct DenseUnionType : public CollectionType<Type::DENSE_UNION> {
- typedef CollectionType<Type::DENSE_UNION> Base;
-
- explicit DenseUnionType(const std::vector<TypePtr>& child_types) : Base() {
- child_types_ = child_types;
- }
-
- virtual std::string ToString() const;
-};
-
-struct SparseUnionType : public CollectionType<Type::SPARSE_UNION> {
- typedef CollectionType<Type::SPARSE_UNION> Base;
-
- explicit SparseUnionType(const std::vector<TypePtr>& child_types) : Base() {
- child_types_ = child_types;
- }
-
- virtual std::string ToString() const;
-};
-
class UnionArray : public Array {
protected:
// The data are types encoded as int16
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/util/logging.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/logging.h b/cpp/src/arrow/util/logging.h
index 06ee841..b22f07d 100644
--- a/cpp/src/arrow/util/logging.h
+++ b/cpp/src/arrow/util/logging.h
@@ -118,9 +118,9 @@ class CerrLog {
class FatalLog : public CerrLog {
public:
explicit FatalLog(int /* severity */) // NOLINT
- : CerrLog(ARROW_FATAL) {} // NOLINT
+ : CerrLog(ARROW_FATAL){} // NOLINT
- [[noreturn]] ~FatalLog() {
+ [[noreturn]] ~FatalLog() {
if (has_logged_) { std::cerr << std::endl; }
std::exit(1);
}
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/format/Metadata.md
----------------------------------------------------------------------
diff --git a/format/Metadata.md b/format/Metadata.md
index 653a4c7..a4878f3 100644
--- a/format/Metadata.md
+++ b/format/Metadata.md
@@ -98,6 +98,11 @@ Union:
"typeIds" : [ /* integer */ ]
}
```
+
+The `typeIds` field in the Union are the codes used to denote each type, which
+may be different from the index of the child array. This is so that the union
+type ids do not have to be enumerated from 0.
+
Int:
```
{
[2/2] arrow git commit: ARROW-373: [C++] JSON serialization format
for testing
Posted by we...@apache.org.
ARROW-373: [C++] JSON serialization format for testing
C++ version of ARROW-372
Author: Wes McKinney <we...@twosigma.com>
Closes #202 from wesm/ARROW-373 and squashes the following commits:
d13a05f [Wes McKinney] Compiler warning
72c24fe [Wes McKinney] Add a minimal literal JSON example
a2cf47b [Wes McKinney] cpplint
3d9fcc2 [Wes McKinney] Complete round trip json file test with multiple record batches
2753449 [Wes McKinney] Complete draft json roundtrip implementation. tests not complete yet
3d6bbbd [Wes McKinney] Start high level writer scaffold
6bbd669 [Wes McKinney] Tweaks
e2e86b5 [Wes McKinney] Test JSON array roundtrip for numeric types, strings, lists, structs
82f108b [Wes McKinney] Refactoring. Array test scaffold
0891378 [Wes McKinney] Declare loop variables
6566343 [Wes McKinney] Recursively construct children for list/struct
35c2f85 [Wes McKinney] Refactoring. Start drafting string/list reader
f26402a [Wes McKinney] Install type_traits.h. cpplint
4fc7294 [Wes McKinney] Refactoring, type attribute consistency. Array reader compiles
2c93cce [Wes McKinney] WIP JSON array reader code path
932ba7a [Wes McKinney] Add ArrayVisitor methods, add enough metaprogramming to detect presence of c_type type member
15c1094 [Wes McKinney] Add type traits, refactoring, drafting json array writing. not working yet
209ba48 [Wes McKinney] More types refactoring. Strange linker error in pyarrow
379da3c [Wes McKinney] Implement union metadata JSON serialization
5fbea41 [Wes McKinney] Implement some more json types and add convenience factory functions
1c08233 [Wes McKinney] JSON schema roundtrip passing for many types
86c9559 [Wes McKinney] Add convenience factory functions for common types
3b9d14e [Wes McKinney] Add type-specific JSON metadata to schema writer
820b0f2 [Wes McKinney] Drafting JSON schema read/write
68ee7ab [Wes McKinney] Move forward declarations into type_fwd.h
1edf2a9 [Wes McKinney] Prototyping out visitor pattern for json serialization
24c1d5d [Wes McKinney] Some Types refactoring, add TypeVisitor abstract class. Add RapidJSON as external project
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/ed6ec3b7
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/ed6ec3b7
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/ed6ec3b7
Branch: refs/heads/master
Commit: ed6ec3b76e1ac27fab85cd4bc74fbd61e8dfb27f
Parents: 8417096
Author: Wes McKinney <we...@twosigma.com>
Authored: Fri Nov 18 14:58:46 2016 -0500
Committer: Wes McKinney <we...@twosigma.com>
Committed: Fri Nov 18 14:58:46 2016 -0500
----------------------------------------------------------------------
cpp/CMakeLists.txt | 19 +
cpp/src/arrow/CMakeLists.txt | 2 +
cpp/src/arrow/array.cc | 15 +
cpp/src/arrow/array.h | 12 +
cpp/src/arrow/column-test.cc | 1 +
cpp/src/arrow/io/hdfs.cc | 8 +-
cpp/src/arrow/io/libhdfs_shim.cc | 26 +-
cpp/src/arrow/ipc/CMakeLists.txt | 7 +
cpp/src/arrow/ipc/adapter.cc | 2 +-
cpp/src/arrow/ipc/ipc-json-test.cc | 353 +++++++++
cpp/src/arrow/ipc/json-internal.cc | 1113 ++++++++++++++++++++++++++++
cpp/src/arrow/ipc/json-internal.h | 111 +++
cpp/src/arrow/ipc/json.cc | 219 ++++++
cpp/src/arrow/ipc/json.h | 92 +++
cpp/src/arrow/ipc/test-common.h | 14 +-
cpp/src/arrow/schema-test.cc | 52 +-
cpp/src/arrow/schema.cc | 15 +
cpp/src/arrow/schema.h | 12 +-
cpp/src/arrow/test-util.h | 51 +-
cpp/src/arrow/type.cc | 122 ++-
cpp/src/arrow/type.h | 338 +++++++--
cpp/src/arrow/type_fwd.h | 157 ++++
cpp/src/arrow/type_traits.h | 197 +++++
cpp/src/arrow/types/CMakeLists.txt | 1 -
cpp/src/arrow/types/collection.h | 41 -
cpp/src/arrow/types/datetime.h | 37 +-
cpp/src/arrow/types/decimal.h | 14 +-
cpp/src/arrow/types/list-test.cc | 2 +-
cpp/src/arrow/types/list.cc | 4 +
cpp/src/arrow/types/list.h | 8 +-
cpp/src/arrow/types/primitive-test.cc | 36 +-
cpp/src/arrow/types/primitive.cc | 97 ++-
cpp/src/arrow/types/primitive.h | 190 ++---
cpp/src/arrow/types/string-test.cc | 12 +-
cpp/src/arrow/types/string.cc | 16 +-
cpp/src/arrow/types/string.h | 24 +-
cpp/src/arrow/types/struct-test.cc | 2 +-
cpp/src/arrow/types/struct.cc | 4 +
cpp/src/arrow/types/struct.h | 4 +
cpp/src/arrow/types/test-common.h | 16 +
cpp/src/arrow/types/union.cc | 23 +-
cpp/src/arrow/types/union.h | 21 -
cpp/src/arrow/util/logging.h | 4 +-
format/Metadata.md | 5 +
44 files changed, 3049 insertions(+), 450 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 6f95483..0bff752 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -545,6 +545,25 @@ if(ARROW_BUILD_BENCHMARKS)
endif()
endif()
+# RapidJSON, header only dependency
+if("$ENV{RAPIDJSON_HOME}" STREQUAL "")
+ ExternalProject_Add(rapidjson_ep
+ PREFIX "${CMAKE_BINARY_DIR}"
+ URL "https://github.com/miloyip/rapidjson/archive/v1.1.0.tar.gz"
+ URL_MD5 "badd12c511e081fec6c89c43a7027bce"
+ CONFIGURE_COMMAND ""
+ BUILD_COMMAND ""
+ BUILD_IN_SOURCE 1
+ INSTALL_COMMAND "")
+
+ ExternalProject_Get_Property(rapidjson_ep SOURCE_DIR)
+ set(RAPIDJSON_INCLUDE_DIR "${SOURCE_DIR}/include")
+else()
+ set(RAPIDJSON_INCLUDE_DIR "$ENV{RAPIDJSON_HOME}/include")
+endif()
+message(STATUS "RapidJSON include dir: ${RAPIDJSON_INCLUDE_DIR}")
+include_directories(SYSTEM ${RAPIDJSON_INCLUDE_DIR})
+
## Google PerfTools
##
## Disabled with TSAN/ASAN as well as with gold+dynamic linking (see comment
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index a9b2fec..81851bc 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -24,6 +24,8 @@ install(FILES
schema.h
table.h
type.h
+ type_fwd.h
+ type_traits.h
test-util.h
DESTINATION include/arrow)
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/array.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index e432a53..3262425 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -18,6 +18,7 @@
#include "arrow/array.h"
#include <cstdint>
+#include <cstring>
#include "arrow/util/bit-util.h"
#include "arrow/util/buffer.h"
@@ -25,6 +26,16 @@
namespace arrow {
+Status GetEmptyBitmap(
+ MemoryPool* pool, int32_t length, std::shared_ptr<MutableBuffer>* result) {
+ auto buffer = std::make_shared<PoolBuffer>(pool);
+ RETURN_NOT_OK(buffer->Resize(BitUtil::BytesForBits(length)));
+ memset(buffer->mutable_data(), 0, buffer->size());
+
+ *result = buffer;
+ return Status::OK();
+}
+
// ----------------------------------------------------------------------
// Base array class
@@ -66,4 +77,8 @@ bool NullArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_st
return true;
}
+Status NullArray::Accept(ArrayVisitor* visitor) const {
+ return visitor->Visit(*this);
+}
+
} // namespace arrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/array.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index ff37323..ff2b70e 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -29,6 +29,8 @@
namespace arrow {
class Buffer;
+class MemoryPool;
+class MutableBuffer;
class Status;
// Immutable data array with some logical type and some length. Any memory is
@@ -70,6 +72,8 @@ class ARROW_EXPORT Array {
// returning Status::OK. This can be an expensive check.
virtual Status Validate() const;
+ virtual Status Accept(ArrayVisitor* visitor) const = 0;
+
protected:
std::shared_ptr<DataType> type_;
int32_t null_count_;
@@ -86,6 +90,8 @@ class ARROW_EXPORT Array {
// Degenerate null type Array
class ARROW_EXPORT NullArray : public Array {
public:
+ using TypeClass = NullType;
+
NullArray(const std::shared_ptr<DataType>& type, int32_t length)
: Array(type, length, length, nullptr) {}
@@ -94,9 +100,15 @@ class ARROW_EXPORT NullArray : public Array {
bool Equals(const std::shared_ptr<Array>& arr) const override;
bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_index,
const std::shared_ptr<Array>& arr) const override;
+
+ Status Accept(ArrayVisitor* visitor) const override;
};
typedef std::shared_ptr<Array> ArrayPtr;
+
+Status ARROW_EXPORT GetEmptyBitmap(
+ MemoryPool* pool, int32_t length, std::shared_ptr<MutableBuffer>* result);
+
} // namespace arrow
#endif
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/column-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/column-test.cc b/cpp/src/arrow/column-test.cc
index 1edf313..ac3636d 100644
--- a/cpp/src/arrow/column-test.cc
+++ b/cpp/src/arrow/column-test.cc
@@ -22,6 +22,7 @@
#include "gtest/gtest.h"
+#include "arrow/array.h"
#include "arrow/column.h"
#include "arrow/schema.h"
#include "arrow/test-util.h"
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/io/hdfs.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/io/hdfs.cc b/cpp/src/arrow/io/hdfs.cc
index 6490a75..13491e7 100644
--- a/cpp/src/arrow/io/hdfs.cc
+++ b/cpp/src/arrow/io/hdfs.cc
@@ -289,13 +289,9 @@ class HdfsClient::HdfsClientImpl {
// connect to HDFS with the builder object
hdfsBuilder* builder = hdfsNewBuilder();
- if (!config->host.empty()) {
- hdfsBuilderSetNameNode(builder, config->host.c_str());
- }
+ if (!config->host.empty()) { hdfsBuilderSetNameNode(builder, config->host.c_str()); }
hdfsBuilderSetNameNodePort(builder, config->port);
- if (!config->user.empty()) {
- hdfsBuilderSetUserName(builder, config->user.c_str());
- }
+ if (!config->user.empty()) { hdfsBuilderSetUserName(builder, config->user.c_str()); }
if (!config->kerb_ticket.empty()) {
hdfsBuilderSetKerbTicketCachePath(builder, config->kerb_ticket.c_str());
}
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/io/libhdfs_shim.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/io/libhdfs_shim.cc b/cpp/src/arrow/io/libhdfs_shim.cc
index 1fee595..36b8a4e 100644
--- a/cpp/src/arrow/io/libhdfs_shim.cc
+++ b/cpp/src/arrow/io/libhdfs_shim.cc
@@ -74,12 +74,9 @@ static HINSTANCE libjvm_handle = NULL;
// NOTE(wesm): cpplint does not like use of short and other imprecise C types
static hdfsBuilder* (*ptr_hdfsNewBuilder)(void) = NULL;
-static void (*ptr_hdfsBuilderSetNameNode)(
- hdfsBuilder* bld, const char* nn) = NULL;
-static void (*ptr_hdfsBuilderSetNameNodePort)(
- hdfsBuilder* bld, tPort port) = NULL;
-static void (*ptr_hdfsBuilderSetUserName)(
- hdfsBuilder* bld, const char* userName) = NULL;
+static void (*ptr_hdfsBuilderSetNameNode)(hdfsBuilder* bld, const char* nn) = NULL;
+static void (*ptr_hdfsBuilderSetNameNodePort)(hdfsBuilder* bld, tPort port) = NULL;
+static void (*ptr_hdfsBuilderSetUserName)(hdfsBuilder* bld, const char* userName) = NULL;
static void (*ptr_hdfsBuilderSetKerbTicketCachePath)(
hdfsBuilder* bld, const char* kerbTicketCachePath) = NULL;
static hdfsFS (*ptr_hdfsBuilderConnect)(hdfsBuilder* bld) = NULL;
@@ -173,9 +170,9 @@ void hdfsBuilderSetUserName(hdfsBuilder* bld, const char* userName) {
ptr_hdfsBuilderSetUserName(bld, userName);
}
-void hdfsBuilderSetKerbTicketCachePath(hdfsBuilder* bld,
- const char* kerbTicketCachePath) {
- ptr_hdfsBuilderSetKerbTicketCachePath(bld , kerbTicketCachePath);
+void hdfsBuilderSetKerbTicketCachePath(
+ hdfsBuilder* bld, const char* kerbTicketCachePath) {
+ ptr_hdfsBuilderSetKerbTicketCachePath(bld, kerbTicketCachePath);
}
hdfsFS hdfsBuilderConnect(hdfsBuilder* bld) {
@@ -364,7 +361,7 @@ static std::vector<fs::path> get_potential_libhdfs_paths() {
std::vector<fs::path> libhdfs_potential_paths;
std::string file_name;
- // OS-specific file name
+// OS-specific file name
#ifdef __WIN32
file_name = "hdfs.dll";
#elif __APPLE__
@@ -374,10 +371,7 @@ static std::vector<fs::path> get_potential_libhdfs_paths() {
#endif
// Common paths
- std::vector<fs::path> search_paths = {
- fs::path(""),
- fs::path(".")
- };
+ std::vector<fs::path> search_paths = {fs::path(""), fs::path(".")};
// Path from environment variable
const char* hadoop_home = std::getenv("HADOOP_HOME");
@@ -387,9 +381,7 @@ static std::vector<fs::path> get_potential_libhdfs_paths() {
}
const char* libhdfs_dir = std::getenv("ARROW_LIBHDFS_DIR");
- if (libhdfs_dir != nullptr) {
- search_paths.push_back(fs::path(libhdfs_dir));
- }
+ if (libhdfs_dir != nullptr) { search_paths.push_back(fs::path(libhdfs_dir)); }
// All paths with file name
for (auto& path : search_paths) {
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/ipc/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt
index d2db339..6955bcb 100644
--- a/cpp/src/arrow/ipc/CMakeLists.txt
+++ b/cpp/src/arrow/ipc/CMakeLists.txt
@@ -34,6 +34,8 @@ set(ARROW_IPC_TEST_LINK_LIBS
set(ARROW_IPC_SRCS
adapter.cc
file.cc
+ json.cc
+ json-internal.cc
metadata.cc
metadata-internal.cc
)
@@ -79,6 +81,10 @@ ADD_ARROW_TEST(ipc-metadata-test)
ARROW_TEST_LINK_LIBRARIES(ipc-metadata-test
${ARROW_IPC_TEST_LINK_LIBS})
+ADD_ARROW_TEST(ipc-json-test)
+ARROW_TEST_LINK_LIBRARIES(ipc-json-test
+ ${ARROW_IPC_TEST_LINK_LIBS})
+
# make clean will delete the generated file
set_source_files_properties(Metadata_generated.h PROPERTIES GENERATED TRUE)
@@ -114,6 +120,7 @@ add_dependencies(arrow_objlib metadata_fbs)
install(FILES
adapter.h
file.h
+ json.h
metadata.h
DESTINATION include/arrow/ipc)
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/ipc/adapter.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc
index 74786bf..da718c0 100644
--- a/cpp/src/arrow/ipc/adapter.cc
+++ b/cpp/src/arrow/ipc/adapter.cc
@@ -106,7 +106,7 @@ Status VisitArray(const Array* arr, std::vector<flatbuf::FieldNode>* field_nodes
buffers->push_back(binary_arr->data());
} else if (arr->type_enum() == Type::LIST) {
const auto list_arr = static_cast<const ListArray*>(arr);
- buffers->push_back(list_arr->offset_buffer());
+ buffers->push_back(list_arr->offsets());
RETURN_NOT_OK(VisitArray(
list_arr->values().get(), field_nodes, buffers, max_recursion_depth - 1));
} else if (arr->type_enum() == Type::STRUCT) {
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/ipc/ipc-json-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/ipc-json-test.cc b/cpp/src/arrow/ipc/ipc-json-test.cc
new file mode 100644
index 0000000..a51371c
--- /dev/null
+++ b/cpp/src/arrow/ipc/ipc-json-test.cc
@@ -0,0 +1,353 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+
+#include "arrow/array.h"
+#include "arrow/ipc/json-internal.h"
+#include "arrow/ipc/json.h"
+#include "arrow/table.h"
+#include "arrow/test-util.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/types/primitive.h"
+#include "arrow/types/string.h"
+#include "arrow/types/struct.h"
+#include "arrow/util/memory-pool.h"
+#include "arrow/util/status.h"
+
+namespace arrow {
+namespace ipc {
+
+void TestSchemaRoundTrip(const Schema& schema) {
+ rj::StringBuffer sb;
+ rj::Writer<rj::StringBuffer> writer(sb);
+
+ ASSERT_OK(WriteJsonSchema(schema, &writer));
+
+ rj::Document d;
+ d.Parse(sb.GetString());
+
+ std::shared_ptr<Schema> out;
+ ASSERT_OK(ReadJsonSchema(d, &out));
+
+ ASSERT_TRUE(schema.Equals(out));
+}
+
+void TestArrayRoundTrip(const Array& array) {
+ static std::string name = "dummy";
+
+ rj::StringBuffer sb;
+ rj::Writer<rj::StringBuffer> writer(sb);
+
+ ASSERT_OK(WriteJsonArray(name, array, &writer));
+
+ std::string array_as_json = sb.GetString();
+
+ rj::Document d;
+ d.Parse(array_as_json);
+
+ if (d.HasParseError()) { FAIL() << "JSON parsing failed"; }
+
+ std::shared_ptr<Array> out;
+ ASSERT_OK(ReadJsonArray(default_memory_pool(), d, array.type(), &out));
+
+ ASSERT_TRUE(array.Equals(out)) << array_as_json;
+}
+
+template <typename T, typename ValueType>
+void CheckPrimitive(const std::shared_ptr<DataType>& type,
+ const std::vector<bool>& is_valid, const std::vector<ValueType>& values) {
+ MemoryPool* pool = default_memory_pool();
+ typename TypeTraits<T>::BuilderType builder(pool, type);
+
+ for (size_t i = 0; i < values.size(); ++i) {
+ if (is_valid[i]) {
+ ASSERT_OK(builder.Append(values[i]));
+ } else {
+ ASSERT_OK(builder.AppendNull());
+ }
+ }
+
+ std::shared_ptr<Array> array;
+ ASSERT_OK(builder.Finish(&array));
+ TestArrayRoundTrip(*array.get());
+}
+
+template <typename TYPE, typename C_TYPE>
+void MakeArray(const std::shared_ptr<DataType>& type, const std::vector<bool>& is_valid,
+ const std::vector<C_TYPE>& values, std::shared_ptr<Array>* out) {
+ std::shared_ptr<Buffer> values_buffer;
+ std::shared_ptr<Buffer> values_bitmap;
+
+ ASSERT_OK(test::CopyBufferFromVector(values, &values_buffer));
+ ASSERT_OK(test::GetBitmapFromBoolVector(is_valid, &values_bitmap));
+
+ using ArrayType = typename TypeTraits<TYPE>::ArrayType;
+
+ int32_t null_count = 0;
+ for (bool val : is_valid) {
+ if (!val) { ++null_count; }
+ }
+
+ *out = std::make_shared<ArrayType>(type, static_cast<int32_t>(values.size()),
+ values_buffer, null_count, values_bitmap);
+}
+
+TEST(TestJsonSchemaWriter, FlatTypes) {
+ std::vector<std::shared_ptr<Field>> fields = {field("f0", int8()),
+ field("f1", int16(), false), field("f2", int32()), field("f3", int64(), false),
+ field("f4", uint8()), field("f5", uint16()), field("f6", uint32()),
+ field("f7", uint64()), field("f8", float32()), field("f9", float64()),
+ field("f10", utf8()), field("f11", binary()), field("f12", list(int32())),
+ field("f13", struct_({field("s1", int32()), field("s2", utf8())})),
+ field("f14", date()), field("f15", timestamp(TimeUnit::NANO)),
+ field("f16", time(TimeUnit::MICRO)),
+ field("f17", union_({field("u1", int8()), field("u2", time(TimeUnit::MILLI))},
+ {0, 1}, UnionMode::DENSE))};
+
+ Schema schema(fields);
+ TestSchemaRoundTrip(schema);
+}
+
+template <typename T>
+void PrimitiveTypesCheckOne() {
+ using c_type = typename T::c_type;
+
+ std::vector<bool> is_valid = {true, false, true, true, true, false, true, true};
+ std::vector<c_type> values = {0, 1, 2, 3, 4, 5, 6, 7};
+ CheckPrimitive<T, c_type>(std::make_shared<T>(), is_valid, values);
+}
+
+TEST(TestJsonArrayWriter, PrimitiveTypes) {
+ PrimitiveTypesCheckOne<Int8Type>();
+ PrimitiveTypesCheckOne<Int16Type>();
+ PrimitiveTypesCheckOne<Int32Type>();
+ PrimitiveTypesCheckOne<Int64Type>();
+ PrimitiveTypesCheckOne<UInt8Type>();
+ PrimitiveTypesCheckOne<UInt16Type>();
+ PrimitiveTypesCheckOne<UInt32Type>();
+ PrimitiveTypesCheckOne<UInt64Type>();
+ PrimitiveTypesCheckOne<FloatType>();
+ PrimitiveTypesCheckOne<DoubleType>();
+
+ std::vector<bool> is_valid = {true, false, true, true, true, false, true, true};
+ std::vector<std::string> values = {"foo", "bar", "", "baz", "qux", "foo", "a", "1"};
+
+ CheckPrimitive<StringType, std::string>(utf8(), is_valid, values);
+ CheckPrimitive<BinaryType, std::string>(binary(), is_valid, values);
+}
+
+TEST(TestJsonArrayWriter, NestedTypes) {
+ auto value_type = int32();
+
+ std::vector<bool> values_is_valid = {true, false, true, true, false, true, true};
+ std::vector<int32_t> values = {0, 1, 2, 3, 4, 5, 6};
+
+ std::shared_ptr<Array> values_array;
+ MakeArray<Int32Type, int32_t>(int32(), values_is_valid, values, &values_array);
+
+ // List
+ std::vector<bool> list_is_valid = {true, false, true, true, true};
+ std::vector<int32_t> offsets = {0, 0, 0, 1, 4, 7};
+
+ std::shared_ptr<Buffer> list_bitmap;
+ ASSERT_OK(test::GetBitmapFromBoolVector(list_is_valid, &list_bitmap));
+ std::shared_ptr<Buffer> offsets_buffer = test::GetBufferFromVector(offsets);
+
+ ListArray list_array(list(value_type), 5, offsets_buffer, values_array, 1, list_bitmap);
+
+ TestArrayRoundTrip(list_array);
+
+ // Struct
+ std::vector<bool> struct_is_valid = {true, false, true, true, true, false, true};
+ std::shared_ptr<Buffer> struct_bitmap;
+ ASSERT_OK(test::GetBitmapFromBoolVector(struct_is_valid, &struct_bitmap));
+
+ auto struct_type =
+ struct_({field("f1", int32()), field("f2", int32()), field("f3", int32())});
+
+ std::vector<std::shared_ptr<Array>> fields = {values_array, values_array, values_array};
+ StructArray struct_array(
+ struct_type, static_cast<int>(struct_is_valid.size()), fields, 2, struct_bitmap);
+ TestArrayRoundTrip(struct_array);
+}
+
+// Data generation for test case below
+void MakeBatchArrays(const std::shared_ptr<Schema>& schema, const int num_rows,
+ std::vector<std::shared_ptr<Array>>* arrays) {
+ std::vector<bool> is_valid;
+ test::random_is_valid(num_rows, 0.25, &is_valid);
+
+ std::vector<int8_t> v1_values;
+ std::vector<int32_t> v2_values;
+
+ test::randint<int8_t>(num_rows, 0, 100, &v1_values);
+ test::randint<int32_t>(num_rows, 0, 100, &v2_values);
+
+ std::shared_ptr<Array> v1;
+ MakeArray<Int8Type, int8_t>(schema->field(0)->type, is_valid, v1_values, &v1);
+
+ std::shared_ptr<Array> v2;
+ MakeArray<Int32Type, int32_t>(schema->field(1)->type, is_valid, v2_values, &v2);
+
+ static const int kBufferSize = 10;
+ static uint8_t buffer[kBufferSize];
+ static uint32_t seed = 0;
+ StringBuilder string_builder(default_memory_pool(), utf8());
+ for (int i = 0; i < num_rows; ++i) {
+ if (!is_valid[i]) {
+ string_builder.AppendNull();
+ } else {
+ test::random_ascii(kBufferSize, seed++, buffer);
+ string_builder.Append(buffer, kBufferSize);
+ }
+ }
+ std::shared_ptr<Array> v3;
+ ASSERT_OK(string_builder.Finish(&v3));
+
+ arrays->emplace_back(v1);
+ arrays->emplace_back(v2);
+ arrays->emplace_back(v3);
+}
+
+TEST(TestJsonFileReadWrite, BasicRoundTrip) {
+ auto v1_type = int8();
+ auto v2_type = int32();
+ auto v3_type = utf8();
+
+ std::shared_ptr<Schema> schema(
+ new Schema({field("f1", v1_type), field("f2", v2_type), field("f3", v3_type)}));
+
+ std::unique_ptr<JsonWriter> writer;
+ ASSERT_OK(JsonWriter::Open(schema, &writer));
+
+ const int nbatches = 3;
+ std::vector<std::shared_ptr<RecordBatch>> batches;
+ for (int i = 0; i < nbatches; ++i) {
+ int32_t num_rows = 5 + i * 5;
+ std::vector<std::shared_ptr<Array>> arrays;
+
+ MakeBatchArrays(schema, num_rows, &arrays);
+ batches.emplace_back(std::make_shared<RecordBatch>(schema, num_rows, arrays));
+ ASSERT_OK(writer->WriteRecordBatch(arrays, num_rows));
+ }
+
+ std::string result;
+ ASSERT_OK(writer->Finish(&result));
+
+ std::unique_ptr<JsonReader> reader;
+
+ auto buffer = std::make_shared<Buffer>(
+ reinterpret_cast<const uint8_t*>(result.c_str()), static_cast<int>(result.size()));
+
+ ASSERT_OK(JsonReader::Open(buffer, &reader));
+ ASSERT_TRUE(reader->schema()->Equals(*schema.get()));
+
+ ASSERT_EQ(nbatches, reader->num_record_batches());
+
+ for (int i = 0; i < nbatches; ++i) {
+ std::shared_ptr<RecordBatch> batch;
+ ASSERT_OK(reader->GetRecordBatch(i, &batch));
+ ASSERT_TRUE(batch->Equals(*batches[i].get()));
+ }
+}
+
+TEST(TestJsonFileReadWrite, MinimalFormatExample) {
+ static const char* example = R"example(
+{
+ "schema": {
+ "fields": [
+ {
+ "name": "foo",
+ "type": {"name": "int", "isSigned": true, "bitWidth": 32},
+ "nullable": true, "children": [],
+ "typeLayout": [
+ {"type": "VALIDITY", "typeBitWidth": 1},
+ {"type": "DATA", "typeBitWidth": 32}
+ ]
+ },
+ {
+ "name": "bar",
+ "type": {"name": "floatingpoint", "precision": "DOUBLE"},
+ "nullable": true, "children": [],
+ "typeLayout": [
+ {"type": "VALIDITY", "typeBitWidth": 1},
+ {"type": "DATA", "typeBitWidth": 64}
+ ]
+ }
+ ]
+ },
+ "batches": [
+ {
+ "count": 5,
+ "columns": [
+ {
+ "name": "foo",
+ "count": 5,
+ "DATA": [1, 2, 3, 4, 5],
+ "VALIDITY": [1, 0, 1, 1, 1]
+ },
+ {
+ "name": "bar",
+ "count": 5,
+ "DATA": [1.0, 2.0, 3.0, 4.0, 5.0],
+ "VALIDITY": [1, 0, 0, 1, 1]
+ }
+ ]
+ }
+ ]
+}
+)example";
+
+ auto buffer = std::make_shared<Buffer>(
+ reinterpret_cast<const uint8_t*>(example), strlen(example));
+
+ std::unique_ptr<JsonReader> reader;
+ ASSERT_OK(JsonReader::Open(buffer, &reader));
+
+ Schema ex_schema({field("foo", int32()), field("bar", float64())});
+
+ ASSERT_TRUE(reader->schema()->Equals(ex_schema));
+ ASSERT_EQ(1, reader->num_record_batches());
+
+ std::shared_ptr<RecordBatch> batch;
+ ASSERT_OK(reader->GetRecordBatch(0, &batch));
+
+ std::vector<bool> foo_valid = {true, false, true, true, true};
+ std::vector<int32_t> foo_values = {1, 2, 3, 4, 5};
+ std::shared_ptr<Array> foo;
+ MakeArray<Int32Type, int32_t>(int32(), foo_valid, foo_values, &foo);
+ ASSERT_TRUE(batch->column(0)->Equals(foo));
+
+ std::vector<bool> bar_valid = {true, false, false, true, true};
+ std::vector<double> bar_values = {1, 2, 3, 4, 5};
+ std::shared_ptr<Array> bar;
+ MakeArray<DoubleType, double>(float64(), bar_valid, bar_values, &bar);
+ ASSERT_TRUE(batch->column(1)->Equals(bar));
+}
+
+} // namespace ipc
+} // namespace arrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/ipc/json-internal.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc
new file mode 100644
index 0000000..31fe35b
--- /dev/null
+++ b/cpp/src/arrow/ipc/json-internal.cc
@@ -0,0 +1,1113 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/ipc/json-internal.h"
+
+#include <cstdint>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include "rapidjson/stringbuffer.h"
+#include "rapidjson/writer.h"
+
+#include "arrow/array.h"
+#include "arrow/schema.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/types/list.h"
+#include "arrow/types/primitive.h"
+#include "arrow/types/string.h"
+#include "arrow/types/struct.h"
+#include "arrow/util/bit-util.h"
+#include "arrow/util/memory-pool.h"
+#include "arrow/util/status.h"
+
+namespace arrow {
+namespace ipc {
+
+using RjArray = rj::Value::ConstArray;
+using RjObject = rj::Value::ConstObject;
+
+enum class BufferType : char { DATA, OFFSET, TYPE, VALIDITY };
+
+static std::string GetBufferTypeName(BufferType type) {
+ switch (type) {
+ case BufferType::DATA:
+ return "DATA";
+ case BufferType::OFFSET:
+ return "OFFSET";
+ case BufferType::TYPE:
+ return "TYPE";
+ case BufferType::VALIDITY:
+ return "VALIDITY";
+ default:
+ break;
+ }
+ return "UNKNOWN";
+}
+
+static std::string GetFloatingPrecisionName(FloatingPointMeta::Precision precision) {
+ switch (precision) {
+ case FloatingPointMeta::HALF:
+ return "HALF";
+ case FloatingPointMeta::SINGLE:
+ return "SINGLE";
+ case FloatingPointMeta::DOUBLE:
+ return "DOUBLE";
+ default:
+ break;
+ }
+ return "UNKNOWN";
+}
+
+static std::string GetTimeUnitName(TimeUnit unit) {
+ switch (unit) {
+ case TimeUnit::SECOND:
+ return "SECOND";
+ case TimeUnit::MILLI:
+ return "MILLISECOND";
+ case TimeUnit::MICRO:
+ return "MICROSECOND";
+ case TimeUnit::NANO:
+ return "NANOSECOND";
+ default:
+ break;
+ }
+ return "UNKNOWN";
+}
+
+class BufferLayout {
+ public:
+ BufferLayout(BufferType type, int bit_width) : type_(type), bit_width_(bit_width) {}
+
+ BufferType type() const { return type_; }
+ int bit_width() const { return bit_width_; }
+
+ private:
+ BufferType type_;
+ int bit_width_;
+};
+
+static const BufferLayout kValidityBuffer(BufferType::VALIDITY, 1);
+static const BufferLayout kOffsetBuffer(BufferType::OFFSET, 32);
+static const BufferLayout kTypeBuffer(BufferType::TYPE, 32);
+static const BufferLayout kBooleanBuffer(BufferType::DATA, 1);
+static const BufferLayout kValues64(BufferType::DATA, 64);
+static const BufferLayout kValues32(BufferType::DATA, 32);
+static const BufferLayout kValues16(BufferType::DATA, 16);
+static const BufferLayout kValues8(BufferType::DATA, 8);
+
+class JsonSchemaWriter : public TypeVisitor {
+ public:
+ explicit JsonSchemaWriter(const Schema& schema, RjWriter* writer)
+ : schema_(schema), writer_(writer) {}
+
+ Status Write() {
+ writer_->StartObject();
+ writer_->Key("fields");
+ writer_->StartArray();
+ for (const std::shared_ptr<Field>& field : schema_.fields()) {
+ RETURN_NOT_OK(VisitField(*field.get()));
+ }
+ writer_->EndArray();
+ writer_->EndObject();
+ return Status::OK();
+ }
+
+ Status VisitField(const Field& field) {
+ writer_->StartObject();
+
+ writer_->Key("name");
+ writer_->String(field.name.c_str());
+
+ writer_->Key("nullable");
+ writer_->Bool(field.nullable);
+
+ // Visit the type
+ RETURN_NOT_OK(field.type->Accept(this));
+ writer_->EndObject();
+
+ return Status::OK();
+ }
+
+ void SetNoChildren() {
+ writer_->Key("children");
+ writer_->StartArray();
+ writer_->EndArray();
+ }
+
+ template <typename T>
+ typename std::enable_if<std::is_base_of<NoExtraMeta, T>::value ||
+ std::is_base_of<BooleanType, T>::value ||
+ std::is_base_of<NullType, T>::value,
+ void>::type
+ WriteTypeMetadata(const T& type) {}
+
+ template <typename T>
+ typename std::enable_if<std::is_base_of<IntegerMeta, T>::value, void>::type
+ WriteTypeMetadata(const T& type) {
+ writer_->Key("bitWidth");
+ writer_->Int(type.bit_width());
+ writer_->Key("isSigned");
+ writer_->Bool(type.is_signed());
+ }
+
+ template <typename T>
+ typename std::enable_if<std::is_base_of<FloatingPointMeta, T>::value, void>::type
+ WriteTypeMetadata(const T& type) {
+ writer_->Key("precision");
+ writer_->String(GetFloatingPrecisionName(type.precision()));
+ }
+
+ template <typename T>
+ typename std::enable_if<std::is_base_of<IntervalType, T>::value, void>::type
+ WriteTypeMetadata(const T& type) {
+ writer_->Key("unit");
+ switch (type.unit) {
+ case IntervalType::Unit::YEAR_MONTH:
+ writer_->String("YEAR_MONTH");
+ break;
+ case IntervalType::Unit::DAY_TIME:
+ writer_->String("DAY_TIME");
+ break;
+ }
+ }
+
+ template <typename T>
+ typename std::enable_if<std::is_base_of<TimeType, T>::value ||
+ std::is_base_of<TimestampType, T>::value,
+ void>::type
+ WriteTypeMetadata(const T& type) {
+ writer_->Key("unit");
+ writer_->String(GetTimeUnitName(type.unit));
+ }
+
+ template <typename T>
+ typename std::enable_if<std::is_base_of<DecimalType, T>::value, void>::type
+ WriteTypeMetadata(const T& type) {
+ writer_->Key("precision");
+ writer_->Int(type.precision);
+ writer_->Key("scale");
+ writer_->Int(type.scale);
+ }
+
+ template <typename T>
+ typename std::enable_if<std::is_base_of<UnionType, T>::value, void>::type
+ WriteTypeMetadata(const T& type) {
+ writer_->Key("mode");
+ switch (type.mode) {
+ case UnionMode::SPARSE:
+ writer_->String("SPARSE");
+ break;
+ case UnionMode::DENSE:
+ writer_->String("DENSE");
+ break;
+ }
+
+ // Write type ids
+ writer_->Key("typeIds");
+ writer_->StartArray();
+ for (size_t i = 0; i < type.type_ids.size(); ++i) {
+ writer_->Uint(type.type_ids[i]);
+ }
+ writer_->EndArray();
+ }
+
+ // TODO(wesm): Other Type metadata
+
+ template <typename T>
+ void WriteName(const std::string& typeclass, const T& type) {
+ writer_->Key("type");
+ writer_->StartObject();
+ writer_->Key("name");
+ writer_->String(typeclass);
+ WriteTypeMetadata(type);
+ writer_->EndObject();
+ }
+
+ template <typename T>
+ Status WritePrimitive(const std::string& typeclass, const T& type,
+ const std::vector<BufferLayout>& buffer_layout) {
+ WriteName(typeclass, type);
+ SetNoChildren();
+ WriteBufferLayout(buffer_layout);
+ return Status::OK();
+ }
+
+ template <typename T>
+ Status WriteVarBytes(const std::string& typeclass, const T& type) {
+ WriteName(typeclass, type);
+ SetNoChildren();
+ WriteBufferLayout({kValidityBuffer, kOffsetBuffer, kValues8});
+ return Status::OK();
+ }
+
+ void WriteBufferLayout(const std::vector<BufferLayout>& buffer_layout) {
+ writer_->Key("typeLayout");
+ writer_->StartArray();
+
+ for (const BufferLayout& buffer : buffer_layout) {
+ writer_->StartObject();
+ writer_->Key("type");
+ writer_->String(GetBufferTypeName(buffer.type()));
+
+ writer_->Key("typeBitWidth");
+ writer_->Int(buffer.bit_width());
+
+ writer_->EndObject();
+ }
+ writer_->EndArray();
+ }
+
+ Status WriteChildren(const std::vector<std::shared_ptr<Field>>& children) {
+ writer_->Key("children");
+ writer_->StartArray();
+ for (const std::shared_ptr<Field>& field : children) {
+ RETURN_NOT_OK(VisitField(*field.get()));
+ }
+ writer_->EndArray();
+ return Status::OK();
+ }
+
+ Status Visit(const NullType& type) override { return WritePrimitive("null", type, {}); }
+
+ Status Visit(const BooleanType& type) override {
+ return WritePrimitive("bool", type, {kValidityBuffer, kBooleanBuffer});
+ }
+
+ Status Visit(const Int8Type& type) override {
+ return WritePrimitive("int", type, {kValidityBuffer, kValues8});
+ }
+
+ Status Visit(const Int16Type& type) override {
+ return WritePrimitive("int", type, {kValidityBuffer, kValues16});
+ }
+
+ Status Visit(const Int32Type& type) override {
+ return WritePrimitive("int", type, {kValidityBuffer, kValues32});
+ }
+
+ Status Visit(const Int64Type& type) override {
+ return WritePrimitive("int", type, {kValidityBuffer, kValues64});
+ }
+
+ Status Visit(const UInt8Type& type) override {
+ return WritePrimitive("int", type, {kValidityBuffer, kValues8});
+ }
+
+ Status Visit(const UInt16Type& type) override {
+ return WritePrimitive("int", type, {kValidityBuffer, kValues16});
+ }
+
+ Status Visit(const UInt32Type& type) override {
+ return WritePrimitive("int", type, {kValidityBuffer, kValues32});
+ }
+
+ Status Visit(const UInt64Type& type) override {
+ return WritePrimitive("int", type, {kValidityBuffer, kValues64});
+ }
+
+ Status Visit(const HalfFloatType& type) override {
+ return WritePrimitive("floatingpoint", type, {kValidityBuffer, kValues16});
+ }
+
+ Status Visit(const FloatType& type) override {
+ return WritePrimitive("floatingpoint", type, {kValidityBuffer, kValues32});
+ }
+
+ Status Visit(const DoubleType& type) override {
+ return WritePrimitive("floatingpoint", type, {kValidityBuffer, kValues64});
+ }
+
+ Status Visit(const StringType& type) override { return WriteVarBytes("utf8", type); }
+
+ Status Visit(const BinaryType& type) override { return WriteVarBytes("binary", type); }
+
+ Status Visit(const DateType& type) override {
+ return WritePrimitive("date", type, {kValidityBuffer, kValues64});
+ }
+
+ Status Visit(const TimeType& type) override {
+ return WritePrimitive("time", type, {kValidityBuffer, kValues64});
+ }
+
+ Status Visit(const TimestampType& type) override {
+ return WritePrimitive("timestamp", type, {kValidityBuffer, kValues64});
+ }
+
+ Status Visit(const IntervalType& type) override {
+ return WritePrimitive("interval", type, {kValidityBuffer, kValues64});
+ }
+
+ Status Visit(const DecimalType& type) override { return Status::NotImplemented("NYI"); }
+
+ Status Visit(const ListType& type) override {
+ WriteName("list", type);
+ RETURN_NOT_OK(WriteChildren(type.children()));
+ WriteBufferLayout({kValidityBuffer, kOffsetBuffer});
+ return Status::OK();
+ }
+
+ Status Visit(const StructType& type) override {
+ WriteName("struct", type);
+ WriteChildren(type.children());
+ WriteBufferLayout({kValidityBuffer, kTypeBuffer});
+ return Status::OK();
+ }
+
+ Status Visit(const UnionType& type) override {
+ WriteName("union", type);
+ WriteChildren(type.children());
+
+ if (type.mode == UnionMode::SPARSE) {
+ WriteBufferLayout({kValidityBuffer, kTypeBuffer});
+ } else {
+ WriteBufferLayout({kValidityBuffer, kTypeBuffer, kOffsetBuffer});
+ }
+ return Status::OK();
+ }
+
+ private:
+ const Schema& schema_;
+ RjWriter* writer_;
+};
+
+class JsonArrayWriter : public ArrayVisitor {
+ public:
+ explicit JsonArrayWriter(const std::string& name, const Array& array, RjWriter* writer)
+ : name_(name), array_(array), writer_(writer) {}
+
+ Status Write() { return VisitArray(name_, array_); }
+
+ Status VisitArray(const std::string& name, const Array& arr) {
+ writer_->StartObject();
+ writer_->Key("name");
+ writer_->String(name);
+
+ writer_->Key("count");
+ writer_->Int(arr.length());
+
+ RETURN_NOT_OK(arr.Accept(this));
+
+ writer_->EndObject();
+ return Status::OK();
+ }
+
+ template <typename T>
+ typename std::enable_if<IsSignedInt<T>::value, void>::type WriteDataValues(
+ const T& arr) {
+ const auto data = arr.raw_data();
+ for (int i = 0; i < arr.length(); ++i) {
+ writer_->Int64(data[i]);
+ }
+ }
+
+ template <typename T>
+ typename std::enable_if<IsUnsignedInt<T>::value, void>::type WriteDataValues(
+ const T& arr) {
+ const auto data = arr.raw_data();
+ for (int i = 0; i < arr.length(); ++i) {
+ writer_->Uint64(data[i]);
+ }
+ }
+
+ template <typename T>
+ typename std::enable_if<IsFloatingPoint<T>::value, void>::type WriteDataValues(
+ const T& arr) {
+ const auto data = arr.raw_data();
+ for (int i = 0; i < arr.length(); ++i) {
+ writer_->Double(data[i]);
+ }
+ }
+
+ // String (Utf8), Binary
+ template <typename T>
+ typename std::enable_if<std::is_base_of<BinaryArray, T>::value, void>::type
+ WriteDataValues(const T& arr) {
+ for (int i = 0; i < arr.length(); ++i) {
+ int32_t length;
+ const char* buf = reinterpret_cast<const char*>(arr.GetValue(i, &length));
+ writer_->String(buf, length);
+ }
+ }
+
+ template <typename T>
+ typename std::enable_if<std::is_base_of<BooleanArray, T>::value, void>::type
+ WriteDataValues(const T& arr) {
+ for (int i = 0; i < arr.length(); ++i) {
+ writer_->Bool(arr.Value(i));
+ }
+ }
+
+ template <typename T>
+ void WriteDataField(const T& arr) {
+ writer_->Key("DATA");
+ writer_->StartArray();
+ WriteDataValues(arr);
+ writer_->EndArray();
+ }
+
+ template <typename T>
+ void WriteOffsetsField(const T* offsets, int32_t length) {
+ writer_->Key("OFFSETS");
+ writer_->StartArray();
+ for (int i = 0; i < length; ++i) {
+ writer_->Int64(offsets[i]);
+ }
+ writer_->EndArray();
+ }
+
+ void WriteValidityField(const Array& arr) {
+ writer_->Key("VALIDITY");
+ writer_->StartArray();
+ if (arr.null_count() > 0) {
+ for (int i = 0; i < arr.length(); ++i) {
+ writer_->Int(arr.IsNull(i) ? 0 : 1);
+ }
+ } else {
+ for (int i = 0; i < arr.length(); ++i) {
+ writer_->Int(1);
+ }
+ }
+ writer_->EndArray();
+ }
+
+ void SetNoChildren() {
+ writer_->Key("children");
+ writer_->StartArray();
+ writer_->EndArray();
+ }
+
+ template <typename T>
+ Status WritePrimitive(const T& array) {
+ WriteValidityField(array);
+ WriteDataField(array);
+ SetNoChildren();
+ return Status::OK();
+ }
+
+ template <typename T>
+ Status WriteVarBytes(const T& array) {
+ WriteValidityField(array);
+ WriteOffsetsField(array.raw_offsets(), array.length() + 1);
+ WriteDataField(array);
+ SetNoChildren();
+ return Status::OK();
+ }
+
+ Status WriteChildren(const std::vector<std::shared_ptr<Field>>& fields,
+ const std::vector<std::shared_ptr<Array>>& arrays) {
+ writer_->Key("children");
+ writer_->StartArray();
+ for (size_t i = 0; i < fields.size(); ++i) {
+ RETURN_NOT_OK(VisitArray(fields[i]->name, *arrays[i].get()));
+ }
+ writer_->EndArray();
+ return Status::OK();
+ }
+
+ Status Visit(const NullArray& array) override {
+ SetNoChildren();
+ return Status::OK();
+ }
+
+ Status Visit(const BooleanArray& array) override { return WritePrimitive(array); }
+
+ Status Visit(const Int8Array& array) override { return WritePrimitive(array); }
+
+ Status Visit(const Int16Array& array) override { return WritePrimitive(array); }
+
+ Status Visit(const Int32Array& array) override { return WritePrimitive(array); }
+
+ Status Visit(const Int64Array& array) override { return WritePrimitive(array); }
+
+ Status Visit(const UInt8Array& array) override { return WritePrimitive(array); }
+
+ Status Visit(const UInt16Array& array) override { return WritePrimitive(array); }
+
+ Status Visit(const UInt32Array& array) override { return WritePrimitive(array); }
+
+ Status Visit(const UInt64Array& array) override { return WritePrimitive(array); }
+
+ Status Visit(const HalfFloatArray& array) override { return WritePrimitive(array); }
+
+ Status Visit(const FloatArray& array) override { return WritePrimitive(array); }
+
+ Status Visit(const DoubleArray& array) override { return WritePrimitive(array); }
+
+ Status Visit(const StringArray& array) override { return WriteVarBytes(array); }
+
+ Status Visit(const BinaryArray& array) override { return WriteVarBytes(array); }
+
+ Status Visit(const DateArray& array) override { return Status::NotImplemented("date"); }
+
+ Status Visit(const TimeArray& array) override { return Status::NotImplemented("time"); }
+
+ Status Visit(const TimestampArray& array) override {
+ return Status::NotImplemented("timestamp");
+ }
+
+ Status Visit(const IntervalArray& array) override {
+ return Status::NotImplemented("interval");
+ }
+
+ Status Visit(const DecimalArray& array) override {
+ return Status::NotImplemented("decimal");
+ }
+
+ Status Visit(const ListArray& array) override {
+ WriteValidityField(array);
+ WriteOffsetsField(array.raw_offsets(), array.length() + 1);
+ auto type = static_cast<const ListType*>(array.type().get());
+ return WriteChildren(type->children(), {array.values()});
+ }
+
+ Status Visit(const StructArray& array) override {
+ WriteValidityField(array);
+ auto type = static_cast<const StructType*>(array.type().get());
+ return WriteChildren(type->children(), array.fields());
+ }
+
+ Status Visit(const UnionArray& array) override {
+ return Status::NotImplemented("union");
+ }
+
+ private:
+ const std::string& name_;
+ const Array& array_;
+ RjWriter* writer_;
+};
+
+class JsonSchemaReader {
+ public:
+ explicit JsonSchemaReader(const rj::Value& json_schema) : json_schema_(json_schema) {}
+
+ Status GetSchema(std::shared_ptr<Schema>* schema) {
+ const auto& obj_schema = json_schema_.GetObject();
+
+ const auto& json_fields = obj_schema.FindMember("fields");
+ RETURN_NOT_ARRAY("fields", json_fields, obj_schema);
+
+ std::vector<std::shared_ptr<Field>> fields;
+ RETURN_NOT_OK(GetFieldsFromArray(json_fields->value, &fields));
+
+ *schema = std::make_shared<Schema>(fields);
+ return Status::OK();
+ }
+
+ Status GetFieldsFromArray(
+ const rj::Value& obj, std::vector<std::shared_ptr<Field>>* fields) {
+ const auto& values = obj.GetArray();
+
+ fields->resize(values.Size());
+ for (size_t i = 0; i < fields->size(); ++i) {
+ RETURN_NOT_OK(GetField(values[i], &(*fields)[i]));
+ }
+ return Status::OK();
+ }
+
+ Status GetField(const rj::Value& obj, std::shared_ptr<Field>* field) {
+ if (!obj.IsObject()) { return Status::Invalid("Field was not a JSON object"); }
+ const auto& json_field = obj.GetObject();
+
+ const auto& json_name = json_field.FindMember("name");
+ RETURN_NOT_STRING("name", json_name, json_field);
+
+ const auto& json_nullable = json_field.FindMember("nullable");
+ RETURN_NOT_BOOL("nullable", json_nullable, json_field);
+
+ const auto& json_type = json_field.FindMember("type");
+ RETURN_NOT_OBJECT("type", json_type, json_field);
+
+ const auto& json_children = json_field.FindMember("children");
+ RETURN_NOT_ARRAY("children", json_children, json_field);
+
+ std::vector<std::shared_ptr<Field>> children;
+ RETURN_NOT_OK(GetFieldsFromArray(json_children->value, &children));
+
+ std::shared_ptr<DataType> type;
+ RETURN_NOT_OK(GetType(json_type->value.GetObject(), children, &type));
+
+ *field = std::make_shared<Field>(
+ json_name->value.GetString(), type, json_nullable->value.GetBool());
+ return Status::OK();
+ }
+
+ Status GetInteger(
+ const rj::Value::ConstObject& json_type, std::shared_ptr<DataType>* type) {
+ const auto& json_bit_width = json_type.FindMember("bitWidth");
+ RETURN_NOT_INT("bitWidth", json_bit_width, json_type);
+
+ const auto& json_is_signed = json_type.FindMember("isSigned");
+ RETURN_NOT_BOOL("isSigned", json_is_signed, json_type);
+
+ bool is_signed = json_is_signed->value.GetBool();
+ int bit_width = json_bit_width->value.GetInt();
+
+ switch (bit_width) {
+ case 8:
+ *type = is_signed ? int8() : uint8();
+ break;
+ case 16:
+ *type = is_signed ? int16() : uint16();
+ break;
+ case 32:
+ *type = is_signed ? int32() : uint32();
+ break;
+ case 64:
+ *type = is_signed ? int64() : uint64();
+ break;
+ default:
+ std::stringstream ss;
+ ss << "Invalid bit width: " << bit_width;
+ return Status::Invalid(ss.str());
+ }
+ return Status::OK();
+ }
+
+ Status GetFloatingPoint(const RjObject& json_type, std::shared_ptr<DataType>* type) {
+ const auto& json_precision = json_type.FindMember("precision");
+ RETURN_NOT_STRING("precision", json_precision, json_type);
+
+ std::string precision = json_precision->value.GetString();
+
+ if (precision == "DOUBLE") {
+ *type = float64();
+ } else if (precision == "SINGLE") {
+ *type = float32();
+ } else if (precision == "HALF") {
+ *type = float16();
+ } else {
+ std::stringstream ss;
+ ss << "Invalid precision: " << precision;
+ return Status::Invalid(ss.str());
+ }
+ return Status::OK();
+ }
+
+ template <typename T>
+ Status GetTimeLike(const RjObject& json_type, std::shared_ptr<DataType>* type) {
+ const auto& json_unit = json_type.FindMember("unit");
+ RETURN_NOT_STRING("unit", json_unit, json_type);
+
+ std::string unit_str = json_unit->value.GetString();
+
+ TimeUnit unit;
+
+ if (unit_str == "SECOND") {
+ unit = TimeUnit::SECOND;
+ } else if (unit_str == "MILLISECOND") {
+ unit = TimeUnit::MILLI;
+ } else if (unit_str == "MICROSECOND") {
+ unit = TimeUnit::MICRO;
+ } else if (unit_str == "NANOSECOND") {
+ unit = TimeUnit::NANO;
+ } else {
+ std::stringstream ss;
+ ss << "Invalid time unit: " << unit_str;
+ return Status::Invalid(ss.str());
+ }
+
+ *type = std::make_shared<T>(unit);
+
+ return Status::OK();
+ }
+
+ Status GetUnion(const RjObject& json_type,
+ const std::vector<std::shared_ptr<Field>>& children,
+ std::shared_ptr<DataType>* type) {
+ const auto& json_mode = json_type.FindMember("mode");
+ RETURN_NOT_STRING("mode", json_mode, json_type);
+
+ std::string mode_str = json_mode->value.GetString();
+ UnionMode mode;
+
+ if (mode_str == "SPARSE") {
+ mode = UnionMode::SPARSE;
+ } else if (mode_str == "DENSE") {
+ mode = UnionMode::DENSE;
+ } else {
+ std::stringstream ss;
+ ss << "Invalid union mode: " << mode_str;
+ return Status::Invalid(ss.str());
+ }
+
+ const auto& json_type_ids = json_type.FindMember("typeIds");
+ RETURN_NOT_ARRAY("typeIds", json_type_ids, json_type);
+
+ std::vector<uint8_t> type_ids;
+ const auto& id_array = json_type_ids->value.GetArray();
+ for (const rj::Value& val : id_array) {
+ DCHECK(val.IsUint());
+ type_ids.push_back(val.GetUint());
+ }
+
+ *type = union_(children, type_ids, mode);
+
+ return Status::OK();
+ }
+
+ Status GetType(const RjObject& json_type,
+ const std::vector<std::shared_ptr<Field>>& children,
+ std::shared_ptr<DataType>* type) {
+ const auto& json_type_name = json_type.FindMember("name");
+ RETURN_NOT_STRING("name", json_type_name, json_type);
+
+ std::string type_name = json_type_name->value.GetString();
+
+ if (type_name == "int") {
+ return GetInteger(json_type, type);
+ } else if (type_name == "floatingpoint") {
+ return GetFloatingPoint(json_type, type);
+ } else if (type_name == "bool") {
+ *type = boolean();
+ } else if (type_name == "utf8") {
+ *type = utf8();
+ } else if (type_name == "binary") {
+ *type = binary();
+ } else if (type_name == "null") {
+ *type = null();
+ } else if (type_name == "date") {
+ *type = date();
+ } else if (type_name == "time") {
+ return GetTimeLike<TimeType>(json_type, type);
+ } else if (type_name == "timestamp") {
+ return GetTimeLike<TimestampType>(json_type, type);
+ } else if (type_name == "list") {
+ *type = list(children[0]);
+ } else if (type_name == "struct") {
+ *type = struct_(children);
+ } else {
+ return GetUnion(json_type, children, type);
+ }
+ return Status::OK();
+ }
+
+ private:
+ const rj::Value& json_schema_;
+};
+
+class JsonArrayReader {
+ public:
+ explicit JsonArrayReader(MemoryPool* pool) : pool_(pool) {}
+
+ Status GetValidityBuffer(const std::vector<bool>& is_valid, int32_t* null_count,
+ std::shared_ptr<Buffer>* validity_buffer) {
+ int length = static_cast<int>(is_valid.size());
+
+ std::shared_ptr<MutableBuffer> out_buffer;
+ RETURN_NOT_OK(GetEmptyBitmap(pool_, length, &out_buffer));
+ uint8_t* bitmap = out_buffer->mutable_data();
+
+ *null_count = 0;
+ for (int i = 0; i < length; ++i) {
+ if (!is_valid[i]) {
+ ++(*null_count);
+ continue;
+ }
+ BitUtil::SetBit(bitmap, i);
+ }
+
+ *validity_buffer = out_buffer;
+ return Status::OK();
+ }
+
+ template <typename T>
+ typename std::enable_if<std::is_base_of<PrimitiveCType, T>::value ||
+ std::is_base_of<BooleanType, T>::value,
+ Status>::type
+ ReadArray(const RjObject& json_array, int32_t length, const std::vector<bool>& is_valid,
+ const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* array) {
+ typename TypeTraits<T>::BuilderType builder(pool_, type);
+
+ const auto& json_data = json_array.FindMember("DATA");
+ RETURN_NOT_ARRAY("DATA", json_data, json_array);
+
+ const auto& json_data_arr = json_data->value.GetArray();
+
+ DCHECK_EQ(static_cast<int32_t>(json_data_arr.Size()), length);
+ for (int i = 0; i < length; ++i) {
+ if (!is_valid[i]) {
+ builder.AppendNull();
+ continue;
+ }
+
+ const rj::Value& val = json_data_arr[i];
+ if (IsSignedInt<T>::value) {
+ DCHECK(val.IsInt());
+ builder.Append(val.GetInt64());
+ } else if (IsUnsignedInt<T>::value) {
+ DCHECK(val.IsUint());
+ builder.Append(val.GetUint64());
+ } else if (IsFloatingPoint<T>::value) {
+ DCHECK(val.IsFloat());
+ builder.Append(val.GetFloat());
+ } else if (std::is_base_of<BooleanType, T>::value) {
+ DCHECK(val.IsBool());
+ builder.Append(val.GetBool());
+ } else {
+ // We are in the wrong function
+ return Status::Invalid(type->ToString());
+ }
+ }
+
+ return builder.Finish(array);
+ }
+
+ template <typename T>
+ typename std::enable_if<std::is_base_of<BinaryType, T>::value, Status>::type ReadArray(
+ const RjObject& json_array, int32_t length, const std::vector<bool>& is_valid,
+ const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* array) {
+ typename TypeTraits<T>::BuilderType builder(pool_, type);
+
+ const auto& json_data = json_array.FindMember("DATA");
+ RETURN_NOT_ARRAY("DATA", json_data, json_array);
+
+ const auto& json_data_arr = json_data->value.GetArray();
+
+ DCHECK_EQ(static_cast<int32_t>(json_data_arr.Size()), length);
+ for (int i = 0; i < length; ++i) {
+ if (!is_valid[i]) {
+ builder.AppendNull();
+ continue;
+ }
+
+ const rj::Value& val = json_data_arr[i];
+ DCHECK(val.IsString());
+ builder.Append(val.GetString());
+ }
+
+ return builder.Finish(array);
+ }
+
+ template <typename T>
+ typename std::enable_if<std::is_base_of<ListType, T>::value, Status>::type ReadArray(
+ const RjObject& json_array, int32_t length, const std::vector<bool>& is_valid,
+ const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* array) {
+ const auto& json_offsets = json_array.FindMember("OFFSETS");
+ RETURN_NOT_ARRAY("OFFSETS", json_offsets, json_array);
+ const auto& json_offsets_arr = json_offsets->value.GetArray();
+
+ int32_t null_count = 0;
+ std::shared_ptr<Buffer> validity_buffer;
+ RETURN_NOT_OK(GetValidityBuffer(is_valid, &null_count, &validity_buffer));
+
+ auto offsets_buffer = std::make_shared<PoolBuffer>(pool_);
+ RETURN_NOT_OK(offsets_buffer->Resize((length + 1) * sizeof(int32_t)));
+ int32_t* offsets = reinterpret_cast<int32_t*>(offsets_buffer->mutable_data());
+
+ for (int i = 0; i < length + 1; ++i) {
+ const rj::Value& val = json_offsets_arr[i];
+ DCHECK(val.IsInt());
+ offsets[i] = val.GetInt();
+ }
+
+ std::vector<std::shared_ptr<Array>> children;
+ RETURN_NOT_OK(GetChildren(json_array, type, &children));
+ DCHECK_EQ(children.size(), 1);
+
+ *array = std::make_shared<ListArray>(
+ type, length, offsets_buffer, children[0], null_count, validity_buffer);
+
+ return Status::OK();
+ }
+
+ template <typename T>
+ typename std::enable_if<std::is_base_of<StructType, T>::value, Status>::type ReadArray(
+ const RjObject& json_array, int32_t length, const std::vector<bool>& is_valid,
+ const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* array) {
+ int32_t null_count = 0;
+ std::shared_ptr<Buffer> validity_buffer;
+ RETURN_NOT_OK(GetValidityBuffer(is_valid, &null_count, &validity_buffer));
+
+ std::vector<std::shared_ptr<Array>> fields;
+ RETURN_NOT_OK(GetChildren(json_array, type, &fields));
+
+ *array =
+ std::make_shared<StructArray>(type, length, fields, null_count, validity_buffer);
+
+ return Status::OK();
+ }
+
+ template <typename T>
+ typename std::enable_if<std::is_base_of<NullType, T>::value, Status>::type ReadArray(
+ const RjObject& json_array, int32_t length, const std::vector<bool>& is_valid,
+ const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* array) {
+ *array = std::make_shared<NullArray>(type, length);
+ return Status::OK();
+ }
+
+ Status GetChildren(const RjObject& json_array, const std::shared_ptr<DataType>& type,
+ std::vector<std::shared_ptr<Array>>* array) {
+ const auto& json_children = json_array.FindMember("children");
+ RETURN_NOT_ARRAY("children", json_children, json_array);
+ const auto& json_children_arr = json_children->value.GetArray();
+
+ if (type->num_children() != static_cast<int>(json_children_arr.Size())) {
+ std::stringstream ss;
+ ss << "Expected " << type->num_children() << " children, but got "
+ << json_children_arr.Size();
+ return Status::Invalid(ss.str());
+ }
+
+ for (int i = 0; i < static_cast<int>(json_children_arr.Size()); ++i) {
+ const rj::Value& json_child = json_children_arr[i];
+ DCHECK(json_child.IsObject());
+
+ std::shared_ptr<Field> child_field = type->child(i);
+
+ auto it = json_child.FindMember("name");
+ RETURN_NOT_STRING("name", it, json_child);
+
+ DCHECK_EQ(it->value.GetString(), child_field->name);
+ std::shared_ptr<Array> child;
+ RETURN_NOT_OK(GetArray(json_children_arr[i], child_field->type, &child));
+ array->emplace_back(child);
+ }
+
+ return Status::OK();
+ }
+
+ Status GetArray(const rj::Value& obj, const std::shared_ptr<DataType>& type,
+ std::shared_ptr<Array>* array) {
+ if (!obj.IsObject()) {
+ return Status::Invalid("Array element was not a JSON object");
+ }
+ const auto& json_array = obj.GetObject();
+
+ const auto& json_length = json_array.FindMember("count");
+ RETURN_NOT_INT("count", json_length, json_array);
+ int32_t length = json_length->value.GetInt();
+
+ const auto& json_valid_iter = json_array.FindMember("VALIDITY");
+ RETURN_NOT_ARRAY("VALIDITY", json_valid_iter, json_array);
+
+ const auto& json_validity = json_valid_iter->value.GetArray();
+
+ DCHECK_EQ(static_cast<int>(json_validity.Size()), length);
+
+ std::vector<bool> is_valid;
+ for (const rj::Value& val : json_validity) {
+ DCHECK(val.IsInt());
+ is_valid.push_back(static_cast<bool>(val.GetInt()));
+ }
+
+#define TYPE_CASE(TYPE) \
+ case TYPE::type_id: \
+ return ReadArray<TYPE>(json_array, length, is_valid, type, array);
+
+#define NOT_IMPLEMENTED_CASE(TYPE_ENUM) \
+ case Type::TYPE_ENUM: { \
+ std::stringstream ss; \
+ ss << type->ToString(); \
+ return Status::NotImplemented(ss.str()); \
+ }
+
+ switch (type->type) {
+ TYPE_CASE(NullType);
+ TYPE_CASE(BooleanType);
+ TYPE_CASE(UInt8Type);
+ TYPE_CASE(Int8Type);
+ TYPE_CASE(UInt16Type);
+ TYPE_CASE(Int16Type);
+ TYPE_CASE(UInt32Type);
+ TYPE_CASE(Int32Type);
+ TYPE_CASE(UInt64Type);
+ TYPE_CASE(Int64Type);
+ TYPE_CASE(HalfFloatType);
+ TYPE_CASE(FloatType);
+ TYPE_CASE(DoubleType);
+ TYPE_CASE(StringType);
+ TYPE_CASE(BinaryType);
+ NOT_IMPLEMENTED_CASE(DATE);
+ NOT_IMPLEMENTED_CASE(TIMESTAMP);
+ NOT_IMPLEMENTED_CASE(TIME);
+ NOT_IMPLEMENTED_CASE(INTERVAL);
+ TYPE_CASE(ListType);
+ TYPE_CASE(StructType);
+ NOT_IMPLEMENTED_CASE(UNION);
+ default:
+ std::stringstream ss;
+ ss << type->ToString();
+ return Status::NotImplemented(ss.str());
+ }
+
+#undef TYPE_CASE
+#undef NOT_IMPLEMENTED_CASE
+
+ return Status::OK();
+ }
+
+ private:
+ MemoryPool* pool_;
+};
+
+Status WriteJsonSchema(const Schema& schema, RjWriter* json_writer) {
+ JsonSchemaWriter converter(schema, json_writer);
+ return converter.Write();
+}
+
+Status ReadJsonSchema(const rj::Value& json_schema, std::shared_ptr<Schema>* schema) {
+ JsonSchemaReader converter(json_schema);
+ return converter.GetSchema(schema);
+}
+
+Status WriteJsonArray(
+ const std::string& name, const Array& array, RjWriter* json_writer) {
+ JsonArrayWriter converter(name, array, json_writer);
+ return converter.Write();
+}
+
+Status ReadJsonArray(MemoryPool* pool, const rj::Value& json_array,
+ const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* array) {
+ JsonArrayReader converter(pool);
+ return converter.GetArray(json_array, type, array);
+}
+
+Status ReadJsonArray(MemoryPool* pool, const rj::Value& json_array, const Schema& schema,
+ std::shared_ptr<Array>* array) {
+ if (!json_array.IsObject()) { return Status::Invalid("Element was not a JSON object"); }
+
+ const auto& json_obj = json_array.GetObject();
+
+ const auto& json_name = json_obj.FindMember("name");
+ RETURN_NOT_STRING("name", json_name, json_obj);
+
+ std::string name = json_name->value.GetString();
+
+ std::shared_ptr<Field> result = nullptr;
+ for (const std::shared_ptr<Field>& field : schema.fields()) {
+ if (field->name == name) {
+ result = field;
+ break;
+ }
+ }
+
+ if (result == nullptr) {
+ std::stringstream ss;
+ ss << "Field named " << name << " not found in schema";
+ return Status::KeyError(ss.str());
+ }
+
+ return ReadJsonArray(pool, json_array, result->type, array);
+}
+
+} // namespace ipc
+} // namespace arrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/ipc/json-internal.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/json-internal.h b/cpp/src/arrow/ipc/json-internal.h
new file mode 100644
index 0000000..0c167a4
--- /dev/null
+++ b/cpp/src/arrow/ipc/json-internal.h
@@ -0,0 +1,111 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef ARROW_IPC_JSON_INTERNAL_H
+#define ARROW_IPC_JSON_INTERNAL_H
+
+#define RAPIDJSON_HAS_STDSTRING 1
+#define RAPIDJSON_HAS_CXX11_RVALUE_REFS 1
+#define RAPIDJSON_HAS_CXX11_RANGE_FOR 1
+
+#include <memory>
+#include <sstream>
+#include <string>
+
+#include "rapidjson/document.h"
+#include "rapidjson/stringbuffer.h"
+#include "rapidjson/writer.h"
+
+#include "arrow/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace rj = rapidjson;
+using RjWriter = rj::Writer<rj::StringBuffer>;
+
+#define RETURN_NOT_FOUND(TOK, NAME, PARENT) \
+ if (NAME == PARENT.MemberEnd()) { \
+ std::stringstream ss; \
+ ss << "field " << TOK << " not found"; \
+ return Status::Invalid(ss.str()); \
+ }
+
+#define RETURN_NOT_STRING(TOK, NAME, PARENT) \
+ RETURN_NOT_FOUND(TOK, NAME, PARENT); \
+ if (!NAME->value.IsString()) { \
+ std::stringstream ss; \
+ ss << "field was not a string" \
+ << " line " << __LINE__; \
+ return Status::Invalid(ss.str()); \
+ }
+
+#define RETURN_NOT_BOOL(TOK, NAME, PARENT) \
+ RETURN_NOT_FOUND(TOK, NAME, PARENT); \
+ if (!NAME->value.IsBool()) { \
+ std::stringstream ss; \
+ ss << "field was not a boolean" \
+ << " line " << __LINE__; \
+ return Status::Invalid(ss.str()); \
+ }
+
+#define RETURN_NOT_INT(TOK, NAME, PARENT) \
+ RETURN_NOT_FOUND(TOK, NAME, PARENT); \
+ if (!NAME->value.IsInt()) { \
+ std::stringstream ss; \
+ ss << "field was not an int" \
+ << " line " << __LINE__; \
+ return Status::Invalid(ss.str()); \
+ }
+
+#define RETURN_NOT_ARRAY(TOK, NAME, PARENT) \
+ RETURN_NOT_FOUND(TOK, NAME, PARENT); \
+ if (!NAME->value.IsArray()) { \
+ std::stringstream ss; \
+ ss << "field was not an array" \
+ << " line " << __LINE__; \
+ return Status::Invalid(ss.str()); \
+ }
+
+#define RETURN_NOT_OBJECT(TOK, NAME, PARENT) \
+ RETURN_NOT_FOUND(TOK, NAME, PARENT); \
+ if (!NAME->value.IsObject()) { \
+ std::stringstream ss; \
+ ss << "field was not an object" \
+ << " line " << __LINE__; \
+ return Status::Invalid(ss.str()); \
+ }
+
+namespace arrow {
+namespace ipc {
+
+// TODO(wesm): Only exporting these because arrow_ipc does not have a static
+// library at the moment. Better to not export
+Status ARROW_EXPORT WriteJsonSchema(const Schema& schema, RjWriter* json_writer);
+Status ARROW_EXPORT WriteJsonArray(
+ const std::string& name, const Array& array, RjWriter* json_writer);
+
+Status ARROW_EXPORT ReadJsonSchema(
+ const rj::Value& json_obj, std::shared_ptr<Schema>* schema);
+Status ARROW_EXPORT ReadJsonArray(MemoryPool* pool, const rj::Value& json_obj,
+ const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* array);
+
+Status ARROW_EXPORT ReadJsonArray(MemoryPool* pool, const rj::Value& json_obj,
+ const Schema& schema, std::shared_ptr<Array>* array);
+
+} // namespace ipc
+} // namespace arrow
+
+#endif // ARROW_IPC_JSON_INTERNAL_H
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/ipc/json.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/json.cc b/cpp/src/arrow/ipc/json.cc
new file mode 100644
index 0000000..2281611
--- /dev/null
+++ b/cpp/src/arrow/ipc/json.cc
@@ -0,0 +1,219 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/ipc/json.h"
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/ipc/json-internal.h"
+#include "arrow/schema.h"
+#include "arrow/table.h"
+#include "arrow/type.h"
+#include "arrow/util/buffer.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/memory-pool.h"
+#include "arrow/util/status.h"
+
+namespace arrow {
+namespace ipc {
+
+// ----------------------------------------------------------------------
+// Writer implementation
+
+class JsonWriter::JsonWriterImpl {
+ public:
+ explicit JsonWriterImpl(const std::shared_ptr<Schema>& schema) : schema_(schema) {
+ writer_.reset(new RjWriter(string_buffer_));
+ }
+
+ Status Start() {
+ writer_->StartObject();
+
+ writer_->Key("schema");
+ RETURN_NOT_OK(WriteJsonSchema(*schema_.get(), writer_.get()));
+
+ // Record batches
+ writer_->Key("batches");
+ writer_->StartArray();
+ return Status::OK();
+ }
+
+ Status Finish(std::string* result) {
+ writer_->EndArray(); // Record batches
+ writer_->EndObject();
+
+ *result = string_buffer_.GetString();
+ return Status::OK();
+ }
+
+ Status WriteRecordBatch(
+ const std::vector<std::shared_ptr<Array>>& columns, int32_t num_rows) {
+ DCHECK_EQ(static_cast<int>(columns.size()), schema_->num_fields());
+
+ writer_->StartObject();
+ writer_->Key("count");
+ writer_->Int(num_rows);
+
+ writer_->Key("columns");
+ writer_->StartArray();
+
+ for (int i = 0; i < schema_->num_fields(); ++i) {
+ const std::shared_ptr<Array>& column = columns[i];
+
+ DCHECK_EQ(num_rows, column->length())
+ << "Array length did not match record batch length";
+
+ RETURN_NOT_OK(
+ WriteJsonArray(schema_->field(i)->name, *column.get(), writer_.get()));
+ }
+
+ writer_->EndArray();
+ writer_->EndObject();
+ return Status::OK();
+ }
+
+ private:
+ std::shared_ptr<Schema> schema_;
+
+ rj::StringBuffer string_buffer_;
+ std::unique_ptr<RjWriter> writer_;
+};
+
+JsonWriter::JsonWriter(const std::shared_ptr<Schema>& schema) {
+ impl_.reset(new JsonWriterImpl(schema));
+}
+
+JsonWriter::~JsonWriter() {}
+
+Status JsonWriter::Open(
+ const std::shared_ptr<Schema>& schema, std::unique_ptr<JsonWriter>* writer) {
+ *writer = std::unique_ptr<JsonWriter>(new JsonWriter(schema));
+ return (*writer)->impl_->Start();
+}
+
+Status JsonWriter::Finish(std::string* result) {
+ return impl_->Finish(result);
+}
+
+Status JsonWriter::WriteRecordBatch(
+ const std::vector<std::shared_ptr<Array>>& columns, int32_t num_rows) {
+ return impl_->WriteRecordBatch(columns, num_rows);
+}
+
+// ----------------------------------------------------------------------
+// Reader implementation
+
+class JsonReader::JsonReaderImpl {
+ public:
+ JsonReaderImpl(MemoryPool* pool, const std::shared_ptr<Buffer>& data)
+ : pool_(pool), data_(data) {}
+
+ Status ParseAndReadSchema() {
+ doc_.Parse(reinterpret_cast<const rj::Document::Ch*>(data_->data()),
+ static_cast<size_t>(data_->size()));
+ if (doc_.HasParseError()) { return Status::IOError("JSON parsing failed"); }
+
+ auto it = doc_.FindMember("schema");
+ RETURN_NOT_OBJECT("schema", it, doc_);
+ RETURN_NOT_OK(ReadJsonSchema(it->value, &schema_));
+
+ it = doc_.FindMember("batches");
+ RETURN_NOT_ARRAY("batches", it, doc_);
+ record_batches_ = &it->value;
+
+ return Status::OK();
+ }
+
+ Status GetRecordBatch(int i, std::shared_ptr<RecordBatch>* batch) const {
+ DCHECK_GE(i, 0) << "i out of bounds";
+ DCHECK_LT(i, static_cast<int>(record_batches_->GetArray().Size()))
+ << "i out of bounds";
+
+ const auto& batch_val = record_batches_->GetArray()[i];
+ DCHECK(batch_val.IsObject());
+
+ const auto& batch_obj = batch_val.GetObject();
+
+ auto it = batch_obj.FindMember("count");
+ RETURN_NOT_INT("count", it, batch_obj);
+ int32_t num_rows = static_cast<int32_t>(it->value.GetInt());
+
+ it = batch_obj.FindMember("columns");
+ RETURN_NOT_ARRAY("columns", it, batch_obj);
+ const auto& json_columns = it->value.GetArray();
+
+ std::vector<std::shared_ptr<Array>> columns(json_columns.Size());
+ for (size_t i = 0; i < columns.size(); ++i) {
+ const std::shared_ptr<DataType>& type = schema_->field(i)->type;
+ RETURN_NOT_OK(ReadJsonArray(pool_, json_columns[i], type, &columns[i]));
+ }
+
+ *batch = std::make_shared<RecordBatch>(schema_, num_rows, columns);
+ return Status::OK();
+ }
+
+ std::shared_ptr<Schema> schema() const { return schema_; }
+
+ int num_record_batches() const {
+ return static_cast<int>(record_batches_->GetArray().Size());
+ }
+
+ private:
+ MemoryPool* pool_;
+ std::shared_ptr<Buffer> data_;
+ rj::Document doc_;
+
+ const rj::Value* record_batches_;
+
+ std::shared_ptr<Schema> schema_;
+};
+
+JsonReader::JsonReader(MemoryPool* pool, const std::shared_ptr<Buffer>& data) {
+ impl_.reset(new JsonReaderImpl(pool, data));
+}
+
+JsonReader::~JsonReader() {}
+
+Status JsonReader::Open(
+ const std::shared_ptr<Buffer>& data, std::unique_ptr<JsonReader>* reader) {
+ return Open(default_memory_pool(), data, reader);
+}
+
+Status JsonReader::Open(MemoryPool* pool, const std::shared_ptr<Buffer>& data,
+ std::unique_ptr<JsonReader>* reader) {
+ *reader = std::unique_ptr<JsonReader>(new JsonReader(pool, data));
+ return (*reader)->impl_->ParseAndReadSchema();
+}
+
+std::shared_ptr<Schema> JsonReader::schema() const {
+ return impl_->schema();
+}
+
+int JsonReader::num_record_batches() const {
+ return impl_->num_record_batches();
+}
+
+Status JsonReader::GetRecordBatch(int i, std::shared_ptr<RecordBatch>* batch) const {
+ return impl_->GetRecordBatch(i, batch);
+}
+
+} // namespace ipc
+} // namespace arrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/ipc/json.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/json.h b/cpp/src/arrow/ipc/json.h
new file mode 100644
index 0000000..7395be4
--- /dev/null
+++ b/cpp/src/arrow/ipc/json.h
@@ -0,0 +1,92 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Implement Arrow JSON serialization format
+
+#ifndef ARROW_IPC_JSON_H
+#define ARROW_IPC_JSON_H
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace io {
+
+class OutputStream;
+class ReadableFileInterface;
+
+} // namespace io
+
+namespace ipc {
+
+class ARROW_EXPORT JsonWriter {
+ public:
+ ~JsonWriter();
+
+ static Status Open(
+ const std::shared_ptr<Schema>& schema, std::unique_ptr<JsonWriter>* out);
+
+ // TODO(wesm): Write dictionaries
+
+ Status WriteRecordBatch(
+ const std::vector<std::shared_ptr<Array>>& columns, int32_t num_rows);
+
+ Status Finish(std::string* result);
+
+ private:
+ explicit JsonWriter(const std::shared_ptr<Schema>& schema);
+
+ // Hide RapidJSON details from public API
+ class JsonWriterImpl;
+ std::unique_ptr<JsonWriterImpl> impl_;
+};
+
+// TODO(wesm): Read from a file stream rather than an in-memory buffer
+class ARROW_EXPORT JsonReader {
+ public:
+ ~JsonReader();
+
+ static Status Open(MemoryPool* pool, const std::shared_ptr<Buffer>& data,
+ std::unique_ptr<JsonReader>* reader);
+
+ // Use the default memory pool
+ static Status Open(
+ const std::shared_ptr<Buffer>& data, std::unique_ptr<JsonReader>* reader);
+
+ std::shared_ptr<Schema> schema() const;
+
+ int num_record_batches() const;
+
+ // Read a record batch from the file
+ Status GetRecordBatch(int i, std::shared_ptr<RecordBatch>* batch) const;
+
+ private:
+ JsonReader(MemoryPool* pool, const std::shared_ptr<Buffer>& data);
+
+ // Hide RapidJSON details from public API
+ class JsonReaderImpl;
+ std::unique_ptr<JsonReaderImpl> impl_;
+};
+
+} // namespace ipc
+} // namespace arrow
+
+#endif // ARROW_IPC_JSON_H
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/ipc/test-common.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h
index 784e238..9abc20d 100644
--- a/cpp/src/arrow/ipc/test-common.h
+++ b/cpp/src/arrow/ipc/test-common.h
@@ -27,6 +27,7 @@
#include "arrow/array.h"
#include "arrow/table.h"
#include "arrow/test-util.h"
+#include "arrow/type.h"
#include "arrow/types/list.h"
#include "arrow/types/primitive.h"
#include "arrow/types/string.h"
@@ -39,15 +40,14 @@ namespace arrow {
namespace ipc {
const auto kInt32 = std::make_shared<Int32Type>();
-const auto kListInt32 = std::make_shared<ListType>(kInt32);
-const auto kListListInt32 = std::make_shared<ListType>(kListInt32);
+const auto kListInt32 = list(kInt32);
+const auto kListListInt32 = list(kListInt32);
Status MakeRandomInt32Array(
int32_t length, bool include_nulls, MemoryPool* pool, std::shared_ptr<Array>* out) {
std::shared_ptr<PoolBuffer> data;
test::MakeRandomInt32PoolBuffer(length, pool, &data);
- const auto kInt32 = std::make_shared<Int32Type>();
- Int32Builder builder(pool, kInt32);
+ Int32Builder builder(pool, int32());
if (include_nulls) {
std::shared_ptr<PoolBuffer> valid_bytes;
test::MakeRandomBytePoolBuffer(length, pool, &valid_bytes);
@@ -134,8 +134,8 @@ Status MakeRandomBinaryArray(
Status MakeStringTypesRecordBatch(std::shared_ptr<RecordBatch>* out) {
const int32_t length = 500;
- auto string_type = std::make_shared<StringType>();
- auto binary_type = std::make_shared<BinaryType>();
+ auto string_type = utf8();
+ auto binary_type = binary();
auto f0 = std::make_shared<Field>("f0", string_type);
auto f1 = std::make_shared<Field>("f1", binary_type);
std::shared_ptr<Schema> schema(new Schema({f0, f1}));
@@ -233,7 +233,7 @@ Status MakeDeeplyNestedList(std::shared_ptr<RecordBatch>* out) {
const bool include_nulls = true;
RETURN_NOT_OK(MakeRandomInt32Array(1000, include_nulls, pool, &array));
for (int i = 0; i < 63; ++i) {
- type = std::static_pointer_cast<DataType>(std::make_shared<ListType>(type));
+ type = std::static_pointer_cast<DataType>(list(type));
RETURN_NOT_OK(MakeRandomListArray(array, batch_length, include_nulls, pool, &array));
}
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/schema-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/schema-test.cc b/cpp/src/arrow/schema-test.cc
index 8cc80be..4826199 100644
--- a/cpp/src/arrow/schema-test.cc
+++ b/cpp/src/arrow/schema-test.cc
@@ -29,23 +29,21 @@ using std::vector;
namespace arrow {
-const auto INT32 = std::make_shared<Int32Type>();
-
TEST(TestField, Basics) {
- Field f0("f0", INT32);
- Field f0_nn("f0", INT32, false);
+ Field f0("f0", int32());
+ Field f0_nn("f0", int32(), false);
ASSERT_EQ(f0.name, "f0");
- ASSERT_EQ(f0.type->ToString(), INT32->ToString());
+ ASSERT_EQ(f0.type->ToString(), int32()->ToString());
ASSERT_TRUE(f0.nullable);
ASSERT_FALSE(f0_nn.nullable);
}
TEST(TestField, Equals) {
- Field f0("f0", INT32);
- Field f0_nn("f0", INT32, false);
- Field f0_other("f0", INT32);
+ Field f0("f0", int32());
+ Field f0_nn("f0", int32(), false);
+ Field f0_other("f0", int32());
ASSERT_EQ(f0, f0_other);
ASSERT_NE(f0, f0_nn);
@@ -57,11 +55,11 @@ class TestSchema : public ::testing::Test {
};
TEST_F(TestSchema, Basics) {
- auto f0 = std::make_shared<Field>("f0", INT32);
- auto f1 = std::make_shared<Field>("f1", std::make_shared<UInt8Type>(), false);
- auto f1_optional = std::make_shared<Field>("f1", std::make_shared<UInt8Type>());
+ auto f0 = field("f0", int32());
+ auto f1 = field("f1", uint8(), false);
+ auto f1_optional = field("f1", uint8());
- auto f2 = std::make_shared<Field>("f2", std::make_shared<StringType>());
+ auto f2 = field("f2", utf8());
vector<shared_ptr<Field>> fields = {f0, f1, f2};
auto schema = std::make_shared<Schema>(fields);
@@ -83,11 +81,10 @@ TEST_F(TestSchema, Basics) {
}
TEST_F(TestSchema, ToString) {
- auto f0 = std::make_shared<Field>("f0", INT32);
- auto f1 = std::make_shared<Field>("f1", std::make_shared<UInt8Type>(), false);
- auto f2 = std::make_shared<Field>("f2", std::make_shared<StringType>());
- auto f3 = std::make_shared<Field>(
- "f3", std::make_shared<ListType>(std::make_shared<Int16Type>()));
+ auto f0 = field("f0", int32());
+ auto f1 = field("f1", uint8(), false);
+ auto f2 = field("f2", utf8());
+ auto f3 = field("f3", list(int16()));
vector<shared_ptr<Field>> fields = {f0, f1, f2, f3};
auto schema = std::make_shared<Schema>(fields);
@@ -101,4 +98,25 @@ f3: list<item: int16>)";
ASSERT_EQ(expected, result);
}
+TEST_F(TestSchema, GetFieldByName) {
+ auto f0 = field("f0", int32());
+ auto f1 = field("f1", uint8(), false);
+ auto f2 = field("f2", utf8());
+ auto f3 = field("f3", list(int16()));
+
+ vector<shared_ptr<Field>> fields = {f0, f1, f2, f3};
+ auto schema = std::make_shared<Schema>(fields);
+
+ std::shared_ptr<Field> result;
+
+ result = schema->GetFieldByName("f1");
+ ASSERT_TRUE(f1->Equals(result));
+
+ result = schema->GetFieldByName("f3");
+ ASSERT_TRUE(f3->Equals(result));
+
+ result = schema->GetFieldByName("not-found");
+ ASSERT_TRUE(result == nullptr);
+}
+
} // namespace arrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/schema.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/schema.cc b/cpp/src/arrow/schema.cc
index ff3ea19..cd8256e 100644
--- a/cpp/src/arrow/schema.cc
+++ b/cpp/src/arrow/schema.cc
@@ -42,6 +42,21 @@ bool Schema::Equals(const std::shared_ptr<Schema>& other) const {
return Equals(*other.get());
}
+std::shared_ptr<Field> Schema::GetFieldByName(const std::string& name) {
+ if (fields_.size() > 0 && name_to_index_.size() == 0) {
+ for (size_t i = 0; i < fields_.size(); ++i) {
+ name_to_index_[fields_[i]->name] = i;
+ }
+ }
+
+ auto it = name_to_index_.find(name);
+ if (it == name_to_index_.end()) {
+ return nullptr;
+ } else {
+ return fields_[it->second];
+ }
+}
+
std::string Schema::ToString() const {
std::stringstream buffer;
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/schema.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/schema.h b/cpp/src/arrow/schema.h
index 4301968..0e1ab5c 100644
--- a/cpp/src/arrow/schema.h
+++ b/cpp/src/arrow/schema.h
@@ -20,14 +20,14 @@
#include <memory>
#include <string>
+#include <unordered_map>
#include <vector>
+#include "arrow/type.h"
#include "arrow/util/visibility.h"
namespace arrow {
-struct Field;
-
class ARROW_EXPORT Schema {
public:
explicit Schema(const std::vector<std::shared_ptr<Field>>& fields);
@@ -37,7 +37,12 @@ class ARROW_EXPORT Schema {
bool Equals(const std::shared_ptr<Schema>& other) const;
// Return the ith schema element. Does not boundscheck
- const std::shared_ptr<Field>& field(int i) const { return fields_[i]; }
+ std::shared_ptr<Field> field(int i) const { return fields_[i]; }
+
+ // Returns nullptr if name not found
+ std::shared_ptr<Field> GetFieldByName(const std::string& name);
+
+ const std::vector<std::shared_ptr<Field>>& fields() const { return fields_; }
// Render a string representation of the schema suitable for debugging
std::string ToString() const;
@@ -46,6 +51,7 @@ class ARROW_EXPORT Schema {
private:
std::vector<std::shared_ptr<Field>> fields_;
+ std::unordered_map<std::string, int> name_to_index_;
};
} // namespace arrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/ed6ec3b7/cpp/src/arrow/test-util.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h
index ac56f5e..ab4b980 100644
--- a/cpp/src/arrow/test-util.h
+++ b/cpp/src/arrow/test-util.h
@@ -27,6 +27,7 @@
#include "gtest/gtest.h"
+#include "arrow/array.h"
#include "arrow/column.h"
#include "arrow/schema.h"
#include "arrow/table.h"
@@ -102,20 +103,57 @@ void random_real(int n, uint32_t seed, T min_value, T max_value, std::vector<T>*
}
template <typename T>
-std::shared_ptr<Buffer> to_buffer(const std::vector<T>& values) {
+std::shared_ptr<Buffer> GetBufferFromVector(const std::vector<T>& values) {
return std::make_shared<Buffer>(
reinterpret_cast<const uint8_t*>(values.data()), values.size() * sizeof(T));
}
+template <typename T>
+inline Status CopyBufferFromVector(
+ const std::vector<T>& values, std::shared_ptr<Buffer>* result) {
+ int64_t nbytes = static_cast<int>(values.size()) * sizeof(T);
+
+ auto buffer = std::make_shared<PoolBuffer>(default_memory_pool());
+ RETURN_NOT_OK(buffer->Resize(nbytes));
+ memcpy(buffer->mutable_data(), values.data(), nbytes);
+
+ *result = buffer;
+ return Status::OK();
+}
+
+static inline Status GetBitmapFromBoolVector(
+ const std::vector<bool>& is_valid, std::shared_ptr<Buffer>* result) {
+ int length = static_cast<int>(is_valid.size());
+
+ std::shared_ptr<MutableBuffer> buffer;
+ RETURN_NOT_OK(GetEmptyBitmap(default_memory_pool(), length, &buffer));
+
+ uint8_t* bitmap = buffer->mutable_data();
+ for (int i = 0; i < length; ++i) {
+ if (is_valid[i]) { BitUtil::SetBit(bitmap, i); }
+ }
+
+ *result = buffer;
+ return Status::OK();
+}
+
// Sets approximately pct_null of the first n bytes in null_bytes to zero
// and the rest to non-zero (true) values.
-void random_null_bytes(int64_t n, double pct_null, uint8_t* null_bytes) {
+static inline void random_null_bytes(int64_t n, double pct_null, uint8_t* null_bytes) {
Random rng(random_seed());
for (int i = 0; i < n; ++i) {
null_bytes[i] = rng.NextDoubleFraction() > pct_null;
}
}
+static inline void random_is_valid(
+ int64_t n, double pct_null, std::vector<bool>* is_valid) {
+ Random rng(random_seed());
+ for (int i = 0; i < n; ++i) {
+ is_valid->push_back(rng.NextDoubleFraction() > pct_null);
+ }
+}
+
static inline void random_bytes(int n, uint32_t seed, uint8_t* out) {
std::mt19937 gen(seed);
std::uniform_int_distribution<int> d(0, 255);
@@ -125,6 +163,15 @@ static inline void random_bytes(int n, uint32_t seed, uint8_t* out) {
}
}
+static inline void random_ascii(int n, uint32_t seed, uint8_t* out) {
+ std::mt19937 gen(seed);
+ std::uniform_int_distribution<int> d(65, 122);
+
+ for (int i = 0; i < n; ++i) {
+ out[i] = d(gen) & 0xFF;
+ }
+}
+
template <typename T>
void rand_uniform_int(int n, uint32_t seed, T min_value, T max_value, T* out) {
DCHECK(out);