You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/12/03 21:01:28 UTC
[GitHub] [arrow] xhochy commented on a change in pull request #8648: ARROW-7906: [C++] Add ORC write support
xhochy commented on a change in pull request #8648:
URL: https://github.com/apache/arrow/pull/8648#discussion_r535594787
##########
File path: cpp/src/arrow/adapters/orc/adapter.cc
##########
@@ -473,6 +453,80 @@ int64_t ORCFileReader::NumberOfStripes() { return impl_->NumberOfStripes(); }
int64_t ORCFileReader::NumberOfRows() { return impl_->NumberOfRows(); }
+class ORCFileWriter::Impl {
+ public:
+ Status Open(Schema* schema, const std::shared_ptr<io::FileOutputStream>& file,
+ std::shared_ptr<liborc::WriterOptions> options,
+ std::shared_ptr<ArrowWriterOptions> arrow_options,
Review comment:
Pass `shared_ptr` instances by const reference.
```suggestion
const std::shared_ptr<liborc::WriterOptions>& options,
const std::shared_ptr<ArrowWriterOptions>& arrow_options,
```
##########
File path: cpp/src/arrow/adapters/orc/adapter_test.cc
##########
@@ -157,4 +217,15960 @@ TEST(TestAdapter, readIntAndStringFileMultipleStripes) {
EXPECT_TRUE(stripe_reader->ReadNext(&record_batch).ok());
}
}
+
+// Arrow2ORC type converter tests
+
+TEST(TestAdapterWriteConverter, typeBool) {
+ DataType* type = boolean().get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
Review comment:
Use `ARROW_EXPECT_OK` instead of silencing the warning using `(void)`.
```suggestion
ARROW_EXPECT_OK(adapters::orc::GetORCType(type, &out));
```
##########
File path: cpp/src/arrow/adapters/orc/adapter_test.cc
##########
@@ -157,4 +217,15960 @@ TEST(TestAdapter, readIntAndStringFileMultipleStripes) {
EXPECT_TRUE(stripe_reader->ReadNext(&record_batch).ok());
}
}
+
+// Arrow2ORC type converter tests
+
+TEST(TestAdapterWriteConverter, typeBool) {
+ DataType* type = boolean().get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::BOOLEAN);
+}
+TEST(TestAdapterWriteConverter, typeInt8) {
+ DataType* type = int8().get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::BYTE);
+}
+TEST(TestAdapterWriteConverter, typeInt16) {
+ DataType* type = int16().get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::SHORT);
+}
+TEST(TestAdapterWriteConverter, typeInt32) {
+ DataType* type = int32().get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::INT);
+}
+TEST(TestAdapterWriteConverter, typeInt64) {
+ DataType* type = int64().get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::LONG);
+}
+TEST(TestAdapterWriteConverter, typeFloat) {
+ DataType* type = float32().get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::FLOAT);
+}
+TEST(TestAdapterWriteConverter, typeDouble) {
+ DataType* type = float64().get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::DOUBLE);
+}
+TEST(TestAdapterWriteConverter, typeString) {
+ DataType* type = utf8().get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::STRING);
+}
+TEST(TestAdapterWriteConverter, typeLargeString) {
+ DataType* type = large_utf8().get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::STRING);
+}
+TEST(TestAdapterWriteConverter, typeBinary) {
+ DataType* type = binary().get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::BINARY);
+}
+TEST(TestAdapterWriteConverter, typeLargeBinary) {
+ DataType* type = large_binary().get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::BINARY);
+}
+TEST(TestAdapterWriteConverter, typeFixedSizeBinary) {
+ DataType* type = fixed_size_binary(3).get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::BINARY);
+}
+TEST(TestAdapterWriteConverter, typeFixedSizeBinaryZero) {
+ DataType* type = fixed_size_binary(0).get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::BINARY);
+}
+TEST(TestAdapterWriteConverter, typeDate32) {
+ DataType* type = date32().get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::DATE);
+}
+TEST(TestAdapterWriteConverter, typeDate64) {
+ DataType* type = date64().get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::TIMESTAMP);
+}
+TEST(TestAdapterWriteConverter, typeTimestampSecond) {
+ DataType* type = timestamp(TimeUnit::type::SECOND).get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::TIMESTAMP);
+}
+TEST(TestAdapterWriteConverter, typeTimestampMilli) {
+ DataType* type = timestamp(TimeUnit::type::MILLI).get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::TIMESTAMP);
+}
+TEST(TestAdapterWriteConverter, typeTimestampMicro) {
+ DataType* type = timestamp(TimeUnit::type::MICRO).get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::TIMESTAMP);
+}
+TEST(TestAdapterWriteConverter, typeTimestampNano) {
+ DataType* type = timestamp(TimeUnit::type::NANO).get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::TIMESTAMP);
+}
+TEST(TestAdapterWriteConverter, typeDecimal) {
+ DataType* type = decimal(32, 5).get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::DECIMAL);
+ EXPECT_EQ(out->getPrecision(), 32);
+ EXPECT_EQ(out->getScale(), 5);
+}
+TEST(TestAdapterWriteConverter, typeList) {
+ auto sharedPtrArrowType = list(std::make_shared<Field>("a", int32()));
+ DataType* type = sharedPtrArrowType.get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getSubtypeCount(), 1);
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::LIST);
+ EXPECT_EQ(out->getSubtype(0)->getKind(), liborc::TypeKind::INT);
+}
+TEST(TestAdapterWriteConverter, typeLargeList) {
+ auto sharedPtrArrowType = large_list(std::make_shared<Field>("a", int32()));
+ DataType* type = sharedPtrArrowType.get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getSubtypeCount(), 1);
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::LIST);
+ EXPECT_EQ(out->getSubtype(0)->getKind(), liborc::TypeKind::INT);
+}
+TEST(TestAdapterWriteConverter, typeFixedSizeList) {
+ auto sharedPtrArrowType = fixed_size_list(std::make_shared<Field>("a", int32()), 3);
+ DataType* type = sharedPtrArrowType.get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getSubtypeCount(), 1);
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::LIST);
+ EXPECT_EQ(out->getSubtype(0)->getKind(), liborc::TypeKind::INT);
+}
+TEST(TestAdapterWriteConverter, typeFixedSizeListZero) {
+ auto sharedPtrArrowType = fixed_size_list(std::make_shared<Field>("a", int32()), 0);
+ DataType* type = sharedPtrArrowType.get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getSubtypeCount(), 1);
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::LIST);
+ EXPECT_EQ(out->getSubtype(0)->getKind(), liborc::TypeKind::INT);
+}
+TEST(TestAdapterWriteConverter, typeStructTrivial) {
+ std::vector<std::shared_ptr<Field>> xFields;
+ auto sharedPtrArrowType = struct_(xFields);
+ DataType* type = sharedPtrArrowType.get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getSubtypeCount(), 0);
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::STRUCT);
+}
+TEST(TestAdapterWriteConverter, typeStructSingleton) {
+ std::vector<std::shared_ptr<Field>> xFields;
+ xFields.push_back(std::make_shared<Field>("a", utf8()));
Review comment:
This can be condensed to
```suggestion
std::vector<std::shared_ptr<Field>> xFields{arrow::field("a", utf8())};
```
##########
File path: cpp/src/arrow/adapters/orc/adapter_test.cc
##########
@@ -157,4 +217,15960 @@ TEST(TestAdapter, readIntAndStringFileMultipleStripes) {
EXPECT_TRUE(stripe_reader->ReadNext(&record_batch).ok());
}
}
+
+// Arrow2ORC type converter tests
+
+TEST(TestAdapterWriteConverter, typeBool) {
+ DataType* type = boolean().get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::BOOLEAN);
+}
+TEST(TestAdapterWriteConverter, typeInt8) {
+ DataType* type = int8().get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::BYTE);
+}
+TEST(TestAdapterWriteConverter, typeInt16) {
+ DataType* type = int16().get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::SHORT);
+}
+TEST(TestAdapterWriteConverter, typeInt32) {
+ DataType* type = int32().get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::INT);
+}
+TEST(TestAdapterWriteConverter, typeInt64) {
+ DataType* type = int64().get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::LONG);
+}
+TEST(TestAdapterWriteConverter, typeFloat) {
+ DataType* type = float32().get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::FLOAT);
+}
+TEST(TestAdapterWriteConverter, typeDouble) {
+ DataType* type = float64().get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::DOUBLE);
+}
+TEST(TestAdapterWriteConverter, typeString) {
+ DataType* type = utf8().get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::STRING);
+}
+TEST(TestAdapterWriteConverter, typeLargeString) {
+ DataType* type = large_utf8().get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::STRING);
+}
+TEST(TestAdapterWriteConverter, typeBinary) {
+ DataType* type = binary().get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::BINARY);
+}
+TEST(TestAdapterWriteConverter, typeLargeBinary) {
+ DataType* type = large_binary().get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::BINARY);
+}
+TEST(TestAdapterWriteConverter, typeFixedSizeBinary) {
+ DataType* type = fixed_size_binary(3).get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::BINARY);
+}
+TEST(TestAdapterWriteConverter, typeFixedSizeBinaryZero) {
+ DataType* type = fixed_size_binary(0).get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::BINARY);
+}
+TEST(TestAdapterWriteConverter, typeDate32) {
+ DataType* type = date32().get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::DATE);
+}
+TEST(TestAdapterWriteConverter, typeDate64) {
+ DataType* type = date64().get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::TIMESTAMP);
+}
+TEST(TestAdapterWriteConverter, typeTimestampSecond) {
+ DataType* type = timestamp(TimeUnit::type::SECOND).get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::TIMESTAMP);
+}
+TEST(TestAdapterWriteConverter, typeTimestampMilli) {
+ DataType* type = timestamp(TimeUnit::type::MILLI).get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::TIMESTAMP);
+}
+TEST(TestAdapterWriteConverter, typeTimestampMicro) {
+ DataType* type = timestamp(TimeUnit::type::MICRO).get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::TIMESTAMP);
+}
+TEST(TestAdapterWriteConverter, typeTimestampNano) {
+ DataType* type = timestamp(TimeUnit::type::NANO).get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::TIMESTAMP);
+}
+TEST(TestAdapterWriteConverter, typeDecimal) {
+ DataType* type = decimal(32, 5).get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::DECIMAL);
+ EXPECT_EQ(out->getPrecision(), 32);
+ EXPECT_EQ(out->getScale(), 5);
+}
+TEST(TestAdapterWriteConverter, typeList) {
+ auto sharedPtrArrowType = list(std::make_shared<Field>("a", int32()));
+ DataType* type = sharedPtrArrowType.get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getSubtypeCount(), 1);
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::LIST);
+ EXPECT_EQ(out->getSubtype(0)->getKind(), liborc::TypeKind::INT);
+}
+TEST(TestAdapterWriteConverter, typeLargeList) {
+ auto sharedPtrArrowType = large_list(std::make_shared<Field>("a", int32()));
+ DataType* type = sharedPtrArrowType.get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getSubtypeCount(), 1);
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::LIST);
+ EXPECT_EQ(out->getSubtype(0)->getKind(), liborc::TypeKind::INT);
+}
+TEST(TestAdapterWriteConverter, typeFixedSizeList) {
+ auto sharedPtrArrowType = fixed_size_list(std::make_shared<Field>("a", int32()), 3);
+ DataType* type = sharedPtrArrowType.get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getSubtypeCount(), 1);
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::LIST);
+ EXPECT_EQ(out->getSubtype(0)->getKind(), liborc::TypeKind::INT);
+}
+TEST(TestAdapterWriteConverter, typeFixedSizeListZero) {
+ auto sharedPtrArrowType = fixed_size_list(std::make_shared<Field>("a", int32()), 0);
+ DataType* type = sharedPtrArrowType.get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getSubtypeCount(), 1);
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::LIST);
+ EXPECT_EQ(out->getSubtype(0)->getKind(), liborc::TypeKind::INT);
+}
+TEST(TestAdapterWriteConverter, typeStructTrivial) {
+ std::vector<std::shared_ptr<Field>> xFields;
+ auto sharedPtrArrowType = struct_(xFields);
+ DataType* type = sharedPtrArrowType.get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getSubtypeCount(), 0);
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::STRUCT);
+}
+TEST(TestAdapterWriteConverter, typeStructSingleton) {
+ std::vector<std::shared_ptr<Field>> xFields;
+ xFields.push_back(std::make_shared<Field>("a", utf8()));
+ auto sharedPtrArrowType = struct_(xFields);
+ DataType* type = sharedPtrArrowType.get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getSubtypeCount(), 1);
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::STRUCT);
+ EXPECT_EQ(out->getFieldName(0), "a");
+ EXPECT_EQ(out->getSubtype(0)->getKind(), liborc::TypeKind::STRING);
+}
+TEST(TestAdapterWriteConverter, typeStruct) {
+ std::vector<std::shared_ptr<Field>> xFields;
+ xFields.push_back(std::make_shared<Field>("a", utf8()));
+ xFields.push_back(std::make_shared<Field>("b", int32()));
+ auto sharedPtrArrowType = struct_(xFields);
+ DataType* type = sharedPtrArrowType.get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getSubtypeCount(), 2);
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::STRUCT);
+ EXPECT_EQ(out->getFieldName(0), "a");
+ EXPECT_EQ(out->getFieldName(1), "b");
+ EXPECT_EQ(out->getSubtype(0)->getKind(), liborc::TypeKind::STRING);
+ EXPECT_EQ(out->getSubtype(1)->getKind(), liborc::TypeKind::INT);
+}
+TEST(TestAdapterWriteConverter, typeMap) {
+ auto sharedPtrArrowType = map(utf8(), int32());
+ DataType* type = sharedPtrArrowType.get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getSubtypeCount(), 2);
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::MAP);
+ EXPECT_EQ(out->getSubtype(0)->getKind(), liborc::TypeKind::STRING);
+ EXPECT_EQ(out->getSubtype(1)->getKind(), liborc::TypeKind::INT);
+}
+TEST(TestAdapterWriteConverter, typeDenseUnionTrivial) {
+ std::vector<std::shared_ptr<Field>> xFields;
+ auto sharedPtrArrowType = dense_union(xFields);
+ DataType* type = sharedPtrArrowType.get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getSubtypeCount(), 0);
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::UNION);
+}
+TEST(TestAdapterWriteConverter, typeDenseUnionSingleton) {
+ std::vector<std::shared_ptr<Field>> xFields;
+ xFields.push_back(std::make_shared<Field>("a", utf8()));
+ auto sharedPtrArrowType = dense_union(xFields);
+ DataType* type = sharedPtrArrowType.get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getSubtypeCount(), 1);
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::UNION);
+ EXPECT_EQ(out->getSubtype(0)->getKind(), liborc::TypeKind::STRING);
+}
+TEST(TestAdapterWriteConverter, typeDenseUnion) {
+ std::vector<std::shared_ptr<Field>> xFields;
+ xFields.push_back(std::make_shared<Field>("a", utf8()));
+ xFields.push_back(std::make_shared<Field>("b", int32()));
+ auto sharedPtrArrowType = dense_union(xFields);
+ DataType* type = sharedPtrArrowType.get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getSubtypeCount(), 2);
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::UNION);
+ EXPECT_EQ(out->getSubtype(0)->getKind(), liborc::TypeKind::STRING);
+ EXPECT_EQ(out->getSubtype(1)->getKind(), liborc::TypeKind::INT);
+}
+TEST(TestAdapterWriteConverter, typeSparseUnionTrivial) {
+ std::vector<std::shared_ptr<Field>> xFields;
+ auto sharedPtrArrowType = sparse_union(xFields);
+ DataType* type = sharedPtrArrowType.get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getSubtypeCount(), 0);
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::UNION);
+}
+TEST(TestAdapterWriteConverter, typeSparseUnionSingleton) {
+ std::vector<std::shared_ptr<Field>> xFields;
+ xFields.push_back(std::make_shared<Field>("b", int32()));
+ auto sharedPtrArrowType = sparse_union(xFields);
+ DataType* type = sharedPtrArrowType.get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getSubtypeCount(), 1);
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::UNION);
+ EXPECT_EQ(out->getSubtype(0)->getKind(), liborc::TypeKind::INT);
+}
+TEST(TestAdapterWriteConverter, typeSparseUnion) {
+ std::vector<std::shared_ptr<Field>> xFields;
+ xFields.push_back(std::make_shared<Field>("a", utf8()));
+ xFields.push_back(std::make_shared<Field>("b", int32()));
+ auto sharedPtrArrowType = sparse_union(xFields);
+ DataType* type = sharedPtrArrowType.get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getSubtypeCount(), 2);
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::UNION);
+ EXPECT_EQ(out->getSubtype(0)->getKind(), liborc::TypeKind::STRING);
+ EXPECT_EQ(out->getSubtype(1)->getKind(), liborc::TypeKind::INT);
+}
+TEST(TestAdapterWriteConverter, typeListOfList) {
+ auto sharedPtrArrowSubtype = list(std::make_shared<Field>("a", int32()));
+ auto sharedPtrArrowType = list(std::make_shared<Field>("a", sharedPtrArrowSubtype));
+ DataType* type = sharedPtrArrowType.get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getSubtypeCount(), 1);
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::LIST);
+ EXPECT_EQ(out->getSubtype(0)->getSubtypeCount(), 1);
+ EXPECT_EQ(out->getSubtype(0)->getKind(), liborc::TypeKind::LIST);
+ EXPECT_EQ(out->getSubtype(0)->getSubtype(0)->getKind(), liborc::TypeKind::INT);
+}
+TEST(TestAdapterWriteConverter, typeListOfMap) {
+ auto sharedPtrArrowSubtype = map(utf8(), int32());
+ auto sharedPtrArrowType = list(std::make_shared<Field>("a", sharedPtrArrowSubtype));
+ DataType* type = sharedPtrArrowType.get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getSubtypeCount(), 1);
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::LIST);
+ EXPECT_EQ(out->getSubtype(0)->getSubtypeCount(), 2);
+ EXPECT_EQ(out->getSubtype(0)->getKind(), liborc::TypeKind::MAP);
+ EXPECT_EQ(out->getSubtype(0)->getSubtype(0)->getKind(), liborc::TypeKind::STRING);
+ EXPECT_EQ(out->getSubtype(0)->getSubtype(1)->getKind(), liborc::TypeKind::INT);
+}
+TEST(TestAdapterWriteConverter, typeListOfStructOfLists) {
+ auto sharedPtrArrowSubsubtype0 = list(std::make_shared<Field>("a", int8()));
+ auto sharedPtrArrowSubsubtype1 = list(std::make_shared<Field>("b", float64()));
+ auto sharedPtrArrowSubsubtype2 = list(std::make_shared<Field>("c", date32()));
+ std::vector<std::shared_ptr<Field>> xFields;
+ xFields.push_back(std::make_shared<Field>("a", sharedPtrArrowSubsubtype0));
+ xFields.push_back(std::make_shared<Field>("b", sharedPtrArrowSubsubtype1));
+ xFields.push_back(std::make_shared<Field>("c", sharedPtrArrowSubsubtype2));
+ auto sharedPtrArrowSubtype = struct_(xFields);
+ auto sharedPtrArrowType = list(std::make_shared<Field>("x", sharedPtrArrowSubtype));
+ DataType* type = sharedPtrArrowType.get();
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(type, &out));
+ EXPECT_EQ(out->getSubtypeCount(), 1);
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::LIST);
+ EXPECT_EQ(out->getSubtype(0)->getSubtypeCount(), 3);
+ EXPECT_EQ(out->getSubtype(0)->getKind(), liborc::TypeKind::STRUCT);
+ EXPECT_EQ(out->getSubtype(0)->getSubtype(0)->getSubtypeCount(), 1);
+ EXPECT_EQ(out->getSubtype(0)->getSubtype(0)->getKind(), liborc::TypeKind::LIST);
+ EXPECT_EQ(out->getSubtype(0)->getSubtype(1)->getSubtypeCount(), 1);
+ EXPECT_EQ(out->getSubtype(0)->getSubtype(1)->getKind(), liborc::TypeKind::LIST);
+ EXPECT_EQ(out->getSubtype(0)->getSubtype(2)->getSubtypeCount(), 1);
+ EXPECT_EQ(out->getSubtype(0)->getSubtype(2)->getKind(), liborc::TypeKind::LIST);
+ EXPECT_EQ(out->getSubtype(0)->getSubtype(0)->getSubtype(0)->getKind(),
+ liborc::TypeKind::BYTE);
+ EXPECT_EQ(out->getSubtype(0)->getSubtype(1)->getSubtype(0)->getKind(),
+ liborc::TypeKind::DOUBLE);
+ EXPECT_EQ(out->getSubtype(0)->getSubtype(2)->getSubtype(0)->getKind(),
+ liborc::TypeKind::DATE);
+}
+TEST(TestAdapterWriteConverter, schemaTrivial) {
+ std::vector<std::shared_ptr<Field>> xFields;
+ std::shared_ptr<Schema> sharedPtrSchema = std::make_shared<Schema>(xFields);
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(sharedPtrSchema.get(), &out));
+ EXPECT_EQ(out->getSubtypeCount(), 0);
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::STRUCT);
+}
+TEST(TestAdapterWriteConverter, schemaSingleton) {
+ std::vector<std::shared_ptr<Field>> xFields;
+ xFields.push_back(std::make_shared<Field>("a", utf8()));
+ std::shared_ptr<Schema> sharedPtrSchema = std::make_shared<Schema>(xFields);
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(sharedPtrSchema.get(), &out));
+ EXPECT_EQ(out->getSubtypeCount(), 1);
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::STRUCT);
+ EXPECT_EQ(out->getFieldName(0), "a");
+ EXPECT_EQ(out->getSubtype(0)->getKind(), liborc::TypeKind::STRING);
+}
+TEST(TestAdapterWriteConverter, schemaMixed1) {
+ auto sharedPtrArrowSubsubtype0 = list(std::make_shared<Field>("a", large_utf8()));
+ auto sharedPtrArrowSubtype0 =
+ list(std::make_shared<Field>("a", sharedPtrArrowSubsubtype0));
+ auto sharedPtrArrowSubtype1 = list(std::make_shared<Field>("b", decimal(30, 4)));
+ auto sharedPtrArrowSubtype2 =
+ list(std::make_shared<Field>("c", timestamp(TimeUnit::type::MICRO)));
+ std::vector<std::shared_ptr<Field>> xFields;
+ xFields.push_back(std::make_shared<Field>("a", sharedPtrArrowSubtype0));
+ xFields.push_back(std::make_shared<Field>("b", sharedPtrArrowSubtype1));
+ xFields.push_back(std::make_shared<Field>("c", sharedPtrArrowSubtype2));
+ xFields.push_back(std::make_shared<Field>("d", boolean()));
+ xFields.push_back(std::make_shared<Field>("e", fixed_size_binary(5)));
+ std::shared_ptr<Schema> sharedPtrSchema = std::make_shared<Schema>(xFields);
+ ORC_UNIQUE_PTR<liborc::Type> out;
+ (void)(adapters::orc::GetORCType(sharedPtrSchema.get(), &out));
+ EXPECT_EQ(out->getSubtypeCount(), 5);
+ EXPECT_EQ(out->getKind(), liborc::TypeKind::STRUCT);
+ EXPECT_EQ(out->getFieldName(0), "a");
+ EXPECT_EQ(out->getFieldName(1), "b");
+ EXPECT_EQ(out->getFieldName(2), "c");
+ EXPECT_EQ(out->getFieldName(3), "d");
+ EXPECT_EQ(out->getFieldName(4), "e");
+ EXPECT_EQ(out->getSubtype(0)->getSubtypeCount(), 1);
+ EXPECT_EQ(out->getSubtype(0)->getKind(), liborc::TypeKind::LIST);
+ EXPECT_EQ(out->getSubtype(1)->getSubtypeCount(), 1);
+ EXPECT_EQ(out->getSubtype(1)->getKind(), liborc::TypeKind::LIST);
+ EXPECT_EQ(out->getSubtype(2)->getSubtypeCount(), 1);
+ EXPECT_EQ(out->getSubtype(2)->getKind(), liborc::TypeKind::LIST);
+ EXPECT_EQ(out->getSubtype(3)->getKind(), liborc::TypeKind::BOOLEAN);
+ EXPECT_EQ(out->getSubtype(4)->getKind(), liborc::TypeKind::BINARY);
+ EXPECT_EQ(out->getSubtype(0)->getSubtype(0)->getSubtypeCount(), 1);
+ EXPECT_EQ(out->getSubtype(0)->getSubtype(0)->getKind(), liborc::TypeKind::LIST);
+ EXPECT_EQ(out->getSubtype(1)->getSubtype(0)->getKind(), liborc::TypeKind::DECIMAL);
+ EXPECT_EQ(out->getSubtype(2)->getSubtype(0)->getKind(), liborc::TypeKind::TIMESTAMP);
+ EXPECT_EQ(out->getSubtype(0)->getSubtype(0)->getSubtype(0)->getKind(),
+ liborc::TypeKind::STRING);
+}
+
+// WriteORC tests
+// TEST(TestAdapterWriteNumerical, writeBoolEmpty0) {
+// BooleanBuilder builder;
+// std::shared_ptr<Array> array;
+// (void)(builder.Finish(&array));
+// std::shared_ptr<Table> table = std::make_shared<Table>({array},{std::String("a")});
+// MemoryOutputStreamV2 file(DEFAULT_SMALL_MEM_STREAM_SIZE);
+// std::unique_ptr<adapters::orc::ORCFileWriter>* writer;
+// ORCFileWriter::Open(table->schema().get(),
+// const std::shared_ptr<io::FileOutputStream>& file,
+// std::shared_ptr<liborc::WriterOptions> options,
+// std::shared_ptr<ArrowWriterOptions> arrow_options,
+// std::unique_ptr<ORCFileWriter>* writer
+// )
+// }
+
+// Numeric
+
+// Bool
+TEST(TestAdapterWriteNumerical, writeBoolEmpty) {
+ BooleanBuilder builder;
+ std::shared_ptr<Array> array;
+ (void)(builder.Finish(&array));
+ MemoryOutputStream mem_stream(DEFAULT_SMALL_MEM_STREAM_SIZE);
+ ORC_UNIQUE_PTR<liborc::Type> schema(
+ liborc::Type::buildTypeFromString("struct<x:boolean>"));
+ liborc::WriterOptions options;
+ ORC_UNIQUE_PTR<liborc::Writer> writer = createWriter(*schema, &mem_stream, options);
+ uint64_t batchSize = 1024;
+ ORC_UNIQUE_PTR<liborc::ColumnVectorBatch> batch = writer->createRowBatch(batchSize);
+ liborc::StructVectorBatch* root =
+ internal::checked_cast<liborc::StructVectorBatch*>(batch.get());
+ liborc::LongVectorBatch* x =
+ internal::checked_cast<liborc::LongVectorBatch*>(root->fields[0]);
+ DataType* arrowType = boolean().get();
+ int64_t arrowOffset = 0;
+ int64_t orcOffset = 0;
+ Status st = adapters::orc::FillBatch(arrowType, x, arrowOffset, orcOffset, batchSize,
+ array.get());
+ if (!st.ok()) {
+ FAIL() << "ORC ColumnBatch not successfully filled";
+ }
+ EXPECT_EQ(x->numElements, 0);
+ EXPECT_FALSE(x->hasNulls);
+ EXPECT_EQ(arrowOffset, 0);
+ EXPECT_EQ(orcOffset, 0);
+ writer->add(*batch);
+ writer->close();
+}
+TEST(TestAdapterWriteNumerical, writeBoolNoNulls) {
+ BooleanBuilder builder;
Review comment:
Instead of using a builder, use `ArrayFromJSON`, e.g. `ArrayFromJSON(arrow::boolean(), "[true, false]")`.
##########
File path: cpp/src/arrow/adapters/orc/adapter.h
##########
@@ -19,14 +19,38 @@
#include <cstdint>
#include <memory>
+#include <sstream>
#include <vector>
+#include "arrow/io/file.h"
#include "arrow/io/interfaces.h"
#include "arrow/memory_pool.h"
#include "arrow/record_batch.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/visibility.h"
+#include "orc/OrcFile.hh"
+
+namespace liborc = orc;
+
+#define ORC_THROW_NOT_OK(s) \
Review comment:
If we have these macros in headers, we should call them `ARROW_ORC_THROW_NOT_OK`.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org