You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by mo...@apache.org on 2022/06/19 02:24:15 UTC
[doris] branch master updated: [improvement] Change array offset type from UInt32 to UInt64 (#10070)
This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 0e404edf54 [improvement] Change array offset type from UInt32 to UInt64 (#10070)
0e404edf54 is described below
commit 0e404edf54f11499b1697958edab761cd52dae54
Author: camby <10...@qq.com>
AuthorDate: Sun Jun 19 10:24:08 2022 +0800
[improvement] Change array offset type from UInt32 to UInt64 (#10070)
Currently, an `Array<T>` column consists of an `offsets` column and a `data` column, and the type of the `offsets` column is UInt32.
If array_union is called repeatedly to merge arrays, the size of the resulting array may overflow the UInt32 offset.
So we need to widen the offset type to UInt64 before the `Array Data Type` feature is released.
---
be/src/exec/olap_scanner.cpp | 2 +-
be/src/olap/column_vector.cpp | 6 ++--
be/src/olap/column_vector.h | 4 +--
be/src/olap/row_block2.cpp | 6 ++--
be/src/olap/rowset/segment_v2/column_reader.cpp | 2 +-
be/src/olap/rowset/segment_v2/column_reader.h | 2 +-
be/src/olap/rowset/segment_v2/column_writer.cpp | 4 +--
be/src/runtime/collection_value.cpp | 18 ++++++------
be/src/runtime/collection_value.h | 32 +++++++++++-----------
be/src/runtime/tuple.cpp | 4 +--
be/src/udf/udf.h | 4 +--
be/src/vec/columns/column.h | 2 +-
be/src/vec/columns/column_string.cpp | 4 +--
be/src/vec/columns/column_vector.cpp | 2 +-
be/src/vec/data_types/data_type_array.cpp | 4 +--
be/src/vec/data_types/data_type_string.cpp | 18 ++++++------
be/src/vec/functions/array/function_array_size.h | 2 +-
be/src/vec/functions/function_string.h | 2 +-
be/src/vec/sink/mysql_result_writer.cpp | 4 +--
be/src/vec/utils/arrow_column_to_doris_column.cpp | 8 +++---
be/test/olap/column_vector_test.cpp | 10 +++----
.../org/apache/doris/catalog/PrimitiveType.java | 3 +-
22 files changed, 72 insertions(+), 71 deletions(-)
diff --git a/be/src/exec/olap_scanner.cpp b/be/src/exec/olap_scanner.cpp
index 69d23f5219..3bd2783ea0 100644
--- a/be/src/exec/olap_scanner.cpp
+++ b/be/src/exec/olap_scanner.cpp
@@ -395,7 +395,7 @@ Status OlapScanner::get_batch(RuntimeState* state, RowBatch* batch, bool* eof) {
auto pool = batch->tuple_data_pool();
CollectionValue::deep_copy_collection(
slot, item_type,
- [pool](int size) -> MemFootprint {
+ [pool](int64_t size) -> MemFootprint {
int64_t offset = pool->total_allocated_bytes();
uint8_t* data = pool->allocate(size);
return {offset, data};
diff --git a/be/src/olap/column_vector.cpp b/be/src/olap/column_vector.cpp
index 41e91b9d63..c4962926f2 100644
--- a/be/src/olap/column_vector.cpp
+++ b/be/src/olap/column_vector.cpp
@@ -144,13 +144,13 @@ Status ColumnVectorBatch::create(size_t init_capacity, bool is_nullable, const T
array_type_info->item_type_info(), field->get_sub_field(0), &elements));
std::unique_ptr<ColumnVectorBatch> offsets;
- const auto* offsets_type_info = get_scalar_type_info<OLAP_FIELD_TYPE_UNSIGNED_INT>();
+ const auto* offsets_type_info = get_scalar_type_info<OLAP_FIELD_TYPE_UNSIGNED_BIGINT>();
RETURN_IF_ERROR(ColumnVectorBatch::create(init_capacity + 1, false, offsets_type_info,
nullptr, &offsets));
std::unique_ptr<ColumnVectorBatch> local(new ArrayColumnVectorBatch(
type_info, is_nullable,
- reinterpret_cast<ScalarColumnVectorBatch<uint32_t>*>(offsets.release()),
+ reinterpret_cast<ScalarColumnVectorBatch<uint64_t>*>(offsets.release()),
elements.release()));
RETURN_IF_ERROR(local->resize(init_capacity));
*column_vector_batch = std::move(local);
@@ -181,7 +181,7 @@ Status ScalarColumnVectorBatch<ScalarType>::resize(size_t new_cap) {
}
ArrayColumnVectorBatch::ArrayColumnVectorBatch(const TypeInfo* type_info, bool is_nullable,
- ScalarColumnVectorBatch<uint32_t>* offsets,
+ ScalarColumnVectorBatch<uint64_t>* offsets,
ColumnVectorBatch* elements)
: ColumnVectorBatch(type_info, is_nullable), _data(0) {
_offsets.reset(offsets);
diff --git a/be/src/olap/column_vector.h b/be/src/olap/column_vector.h
index 70ee6e3041..28139fe7d2 100644
--- a/be/src/olap/column_vector.h
+++ b/be/src/olap/column_vector.h
@@ -178,7 +178,7 @@ private:
class ArrayColumnVectorBatch : public ColumnVectorBatch {
public:
explicit ArrayColumnVectorBatch(const TypeInfo* type_info, bool is_nullable,
- ScalarColumnVectorBatch<uint32_t>* offsets,
+ ScalarColumnVectorBatch<uint64_t>* offsets,
ColumnVectorBatch* elements);
~ArrayColumnVectorBatch() override;
Status resize(size_t new_cap) override;
@@ -249,7 +249,7 @@ private:
std::unique_ptr<ColumnVectorBatch> _elements;
// Stores each array's start offsets in _elements.
- std::unique_ptr<ScalarColumnVectorBatch<uint32_t>> _offsets;
+ std::unique_ptr<ScalarColumnVectorBatch<uint64_t>> _offsets;
};
} // namespace doris
diff --git a/be/src/olap/row_block2.cpp b/be/src/olap/row_block2.cpp
index fd4b0ef23e..947bdd4055 100644
--- a/be/src/olap/row_block2.cpp
+++ b/be/src/olap/row_block2.cpp
@@ -289,7 +289,7 @@ Status RowBlockV2::_copy_data_to_column(int cid,
auto& offsets_col = column_array->get_offsets();
offsets_col.reserve(_selected_size);
- uint32_t offset = offsets_col.back();
+ uint64_t offset = offsets_col.back();
for (uint16_t j = 0; j < _selected_size; ++j) {
uint16_t row_idx = _selection_vector[j];
auto cv = reinterpret_cast<const CollectionValue*>(column_block(cid).cell_ptr(row_idx));
@@ -550,10 +550,10 @@ Status RowBlockV2::_append_data_to_column(const ColumnVectorBatch* batch, size_t
auto nested_col = (*column_array->get_data_ptr()).assume_mutable();
auto& offsets_col = column_array->get_offsets();
- uint32_t offset = offsets_col.back();
+ auto offset = offsets_col.back();
for (uint32_t j = 0; j < selected_size; ++j) {
if (!nullable_mark_array[j]) {
- uint32_t row_idx = j + start;
+ uint64_t row_idx = j + start;
auto cv = reinterpret_cast<const CollectionValue*>(batch->cell_ptr(row_idx));
offset += cv->length();
_append_data_to_column(array_batch->elements(), array_batch->item_offset(row_idx),
diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp
index d68ffc66c0..fe6cd94a82 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -396,7 +396,7 @@ Status ArrayFileColumnIterator::init(const ColumnIteratorOptions& opts) {
if (_array_reader->is_nullable()) {
RETURN_IF_ERROR(_null_iterator->init(opts));
}
- const auto* offset_type_info = get_scalar_type_info<OLAP_FIELD_TYPE_UNSIGNED_INT>();
+ const auto* offset_type_info = get_scalar_type_info<OLAP_FIELD_TYPE_UNSIGNED_BIGINT>();
RETURN_IF_ERROR(
ColumnVectorBatch::create(1024, false, offset_type_info, nullptr, &_length_batch));
return Status::OK();
diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h
index 47250a96cb..a365679ee7 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.h
+++ b/be/src/olap/rowset/segment_v2/column_reader.h
@@ -369,7 +369,7 @@ public:
: size_to_read;
ColumnBlockView ordinal_view(&ordinal_block);
RETURN_IF_ERROR(_length_iterator->next_batch(&this_read, &ordinal_view, &has_null));
- auto* ordinals = reinterpret_cast<uint32_t*>(_length_batch->data());
+ auto* ordinals = reinterpret_cast<uint64_t*>(_length_batch->data());
for (int i = 0; i < this_read; ++i) {
item_ordinal += ordinals[i];
}
diff --git a/be/src/olap/rowset/segment_v2/column_writer.cpp b/be/src/olap/rowset/segment_v2/column_writer.cpp
index 6c76ddff62..c96ccd9a61 100644
--- a/be/src/olap/rowset/segment_v2/column_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/column_writer.cpp
@@ -110,7 +110,7 @@ Status ColumnWriter::create(const ColumnWriterOptions& opts, const TabletColumn*
ColumnWriter::create(item_options, &item_column, _wblock, &item_writer));
// create length writer
- FieldType length_type = FieldType::OLAP_FIELD_TYPE_UNSIGNED_INT;
+ FieldType length_type = FieldType::OLAP_FIELD_TYPE_UNSIGNED_BIGINT;
ColumnWriterOptions length_options;
length_options.meta = opts.meta->add_children_columns();
@@ -119,7 +119,7 @@ Status ColumnWriter::create(const ColumnWriterOptions& opts, const TabletColumn*
length_options.meta->set_type(length_type);
length_options.meta->set_is_nullable(false);
length_options.meta->set_length(
- get_scalar_type_info<OLAP_FIELD_TYPE_UNSIGNED_INT>()->size());
+ get_scalar_type_info<OLAP_FIELD_TYPE_UNSIGNED_BIGINT>()->size());
length_options.meta->set_encoding(DEFAULT_ENCODING);
length_options.meta->set_compression(opts.meta->compression());
diff --git a/be/src/runtime/collection_value.cpp b/be/src/runtime/collection_value.cpp
index 9ea8fc3d43..d8581d8ac5 100644
--- a/be/src/runtime/collection_value.cpp
+++ b/be/src/runtime/collection_value.cpp
@@ -186,7 +186,7 @@ struct ArrayIteratorFunctionsForString : public GenericArrayIteratorFunctions<ty
static void deserialize(void* item, const char* tuple_data, const TypeDescriptor& type_desc) {
auto* string_value = static_cast<CppType*>(item);
if (string_value->len) {
- int offset = convert_to<int>(string_value->ptr);
+ int64_t offset = convert_to<int64_t>(string_value->ptr);
string_value->ptr = convert_to<char*>(tuple_data + offset);
}
}
@@ -448,7 +448,7 @@ size_t CollectionValue::get_byte_size(const TypeDescriptor& item_type) const {
return result;
}
-Status CollectionValue::init_collection(ObjectPool* pool, uint32_t size, PrimitiveType child_type,
+Status CollectionValue::init_collection(ObjectPool* pool, uint64_t size, PrimitiveType child_type,
CollectionValue* value) {
return init_collection(
value, [pool](size_t size) -> uint8_t* { return pool->add_array(new uint8_t[size]); },
@@ -456,7 +456,7 @@ Status CollectionValue::init_collection(ObjectPool* pool, uint32_t size, Primiti
}
Status CollectionValue::init_collection(CollectionValue* value, const AllocateMemFunc& allocate,
- uint32_t size, PrimitiveType child_type) {
+ uint64_t size, PrimitiveType child_type) {
if (value == nullptr) {
return Status::InvalidArgument("collection value is null");
}
@@ -477,13 +477,13 @@ Status CollectionValue::init_collection(CollectionValue* value, const AllocateMe
return Status::OK();
}
-Status CollectionValue::init_collection(MemPool* pool, uint32_t size, PrimitiveType child_type,
+Status CollectionValue::init_collection(MemPool* pool, uint64_t size, PrimitiveType child_type,
CollectionValue* value) {
return init_collection(
value, [pool](size_t size) { return pool->allocate(size); }, size, child_type);
}
-Status CollectionValue::init_collection(FunctionContext* context, uint32_t size,
+Status CollectionValue::init_collection(FunctionContext* context, uint64_t size,
PrimitiveType child_type, CollectionValue* value) {
return init_collection(
value, [context](size_t size) { return context->allocate(size); }, size, child_type);
@@ -506,8 +506,8 @@ void CollectionValue::deep_copy_collection(CollectionValue* shallow_copied_cv,
}
auto iterator = cv->iterator(item_type.type);
- int coll_byte_size = cv->length() * iterator.type_size();
- int nulls_size = cv->has_null() ? cv->length() * sizeof(bool) : 0;
+ uint64_t coll_byte_size = cv->length() * iterator.type_size();
+ uint64_t nulls_size = cv->has_null() ? cv->length() * sizeof(bool) : 0;
MemFootprint footprint = gen_mem_footprint(coll_byte_size + nulls_size);
int64_t offset = footprint.first;
@@ -544,10 +544,10 @@ void CollectionValue::deserialize_collection(CollectionValue* cv, const char* tu
return;
}
// assgin data and null_sign pointer position in tuple_data
- int data_offset = convert_to<int>(cv->data());
+ int64_t data_offset = convert_to<int64_t>(cv->data());
cv->set_data(convert_to<char*>(tuple_data + data_offset));
if (cv->has_null()) {
- int null_offset = convert_to<int>(cv->null_signs());
+ int64_t null_offset = convert_to<int64_t>(cv->null_signs());
cv->set_null_signs(convert_to<bool*>(tuple_data + null_offset));
}
auto iterator = cv->iterator(item_type.type);
diff --git a/be/src/runtime/collection_value.h b/be/src/runtime/collection_value.h
index ccf623171f..3fac161503 100644
--- a/be/src/runtime/collection_value.h
+++ b/be/src/runtime/collection_value.h
@@ -32,7 +32,7 @@ using doris_udf::FunctionContext;
using doris_udf::AnyVal;
using MemFootprint = std::pair<int64_t, uint8_t*>;
-using GenMemFootprintFunc = std::function<MemFootprint(int size)>;
+using GenMemFootprintFunc = std::function<MemFootprint(int64_t size)>;
struct ArrayIteratorFunctionsBase;
class ArrayIterator;
@@ -64,25 +64,25 @@ class CollectionValue {
public:
CollectionValue() = default;
- explicit CollectionValue(uint32_t length)
+ explicit CollectionValue(uint64_t length)
: _data(nullptr), _length(length), _has_null(false), _null_signs(nullptr) {}
- CollectionValue(void* data, uint32_t length)
+ CollectionValue(void* data, uint64_t length)
: _data(data), _length(length), _has_null(false), _null_signs(nullptr) {}
- CollectionValue(void* data, uint32_t length, bool* null_signs)
+ CollectionValue(void* data, uint64_t length, bool* null_signs)
: _data(data), _length(length), _has_null(true), _null_signs(null_signs) {}
- CollectionValue(void* data, uint32_t length, bool has_null, bool* null_signs)
+ CollectionValue(void* data, uint64_t length, bool has_null, bool* null_signs)
: _data(data), _length(length), _has_null(has_null), _null_signs(null_signs) {}
- bool is_null_at(uint32_t index) const { return this->_has_null && this->_null_signs[index]; }
+ bool is_null_at(uint64_t index) const { return this->_has_null && this->_null_signs[index]; }
void to_collection_val(CollectionVal* val) const;
- uint32_t size() const { return _length; }
+ uint64_t size() const { return _length; }
- uint32_t length() const { return _length; }
+ uint64_t length() const { return _length; }
void shallow_copy(const CollectionValue* other);
@@ -96,13 +96,13 @@ public:
/**
* init collection, will alloc (children Type's size + 1) * (children Nums) memory
*/
- static Status init_collection(ObjectPool* pool, uint32_t size, PrimitiveType child_type,
+ static Status init_collection(ObjectPool* pool, uint64_t size, PrimitiveType child_type,
CollectionValue* value);
- static Status init_collection(MemPool* pool, uint32_t size, PrimitiveType child_type,
+ static Status init_collection(MemPool* pool, uint64_t size, PrimitiveType child_type,
CollectionValue* value);
- static Status init_collection(FunctionContext* context, uint32_t size, PrimitiveType child_type,
+ static Status init_collection(FunctionContext* context, uint64_t size, PrimitiveType child_type,
CollectionValue* value);
static CollectionValue from_collection_val(const CollectionVal& val);
@@ -123,7 +123,7 @@ public:
const bool* null_signs() const { return _null_signs; }
void* mutable_data() { return _data; }
bool* mutable_null_signs() { return _null_signs; }
- void set_length(uint32_t length) { _length = length; }
+ void set_length(uint64_t length) { _length = length; }
void set_has_null(bool has_null) { _has_null = has_null; }
void set_data(void* data) { _data = data; }
void set_null_signs(bool* null_signs) { _null_signs = null_signs; }
@@ -131,13 +131,13 @@ public:
private:
using AllocateMemFunc = std::function<uint8_t*(size_t size)>;
static Status init_collection(CollectionValue* value, const AllocateMemFunc& allocate,
- uint32_t size, PrimitiveType child_type);
+ uint64_t size, PrimitiveType child_type);
ArrayIterator internal_iterator(PrimitiveType child_type) const;
private:
// child column data
void* _data;
- uint32_t _length;
+ uint64_t _length;
// item has no null value if has_null is false.
// item ```may``` has null value if has_null is true.
bool _has_null;
@@ -160,7 +160,7 @@ public:
}
return false;
}
- bool seek(uint32_t n) const {
+ bool seek(uint64_t n) const {
if (n >= _collection_value->size()) {
return false;
}
@@ -248,7 +248,7 @@ private:
private:
CollectionValue* _collection_value;
- mutable uint32_t _offset;
+ mutable uint64_t _offset;
const int _type_size;
const bool _is_type_fixed_width;
diff --git a/be/src/runtime/tuple.cpp b/be/src/runtime/tuple.cpp
index 92ee49ac0b..641c13c4bb 100644
--- a/be/src/runtime/tuple.cpp
+++ b/be/src/runtime/tuple.cpp
@@ -93,7 +93,7 @@ void Tuple::deep_copy(Tuple* dst, const TupleDescriptor& desc, MemPool* pool, bo
// copy collection slot
deep_copy_collection_slots(
dst, desc,
- [pool](int size) -> MemFootprint {
+ [pool](int64_t size) -> MemFootprint {
int64_t offset = pool->total_allocated_bytes();
uint8_t* data = pool->allocate(size);
return {offset, data};
@@ -186,7 +186,7 @@ void Tuple::deep_copy(const TupleDescriptor& desc, char** data, int64_t* offset,
// copy collection slots
deep_copy_collection_slots(
dst, desc,
- [offset, data](int size) -> MemFootprint {
+ [offset, data](int64_t size) -> MemFootprint {
MemFootprint footprint = {*offset, reinterpret_cast<uint8_t*>(*data)};
*offset += size;
*data += size;
diff --git a/be/src/udf/udf.h b/be/src/udf/udf.h
index 56e447f375..324a1d362a 100644
--- a/be/src/udf/udf.h
+++ b/be/src/udf/udf.h
@@ -742,7 +742,7 @@ struct HllVal : public StringVal {
struct CollectionVal : public AnyVal {
void* data;
- uint32_t length;
+ uint64_t length;
// item has no null value if has_null is false.
// item ```may``` has null value if has_null is true.
bool has_null;
@@ -751,7 +751,7 @@ struct CollectionVal : public AnyVal {
CollectionVal() = default;
- CollectionVal(void* data, uint32_t length, bool has_null, bool* null_signs)
+ CollectionVal(void* data, uint64_t length, bool has_null, bool* null_signs)
: data(data), length(length), has_null(has_null), null_signs(null_signs) {};
static CollectionVal null() {
diff --git a/be/src/vec/columns/column.h b/be/src/vec/columns/column.h
index b0b6231cbb..77c61a4153 100644
--- a/be/src/vec/columns/column.h
+++ b/be/src/vec/columns/column.h
@@ -306,7 +306,7 @@ public:
* (i-th element should be copied offsets[i] - offsets[i - 1] times.)
* It is necessary in ARRAY JOIN operation.
*/
- using Offset = UInt32;
+ using Offset = UInt64;
using Offsets = PaddedPODArray<Offset>;
virtual Ptr replicate(const Offsets& offsets) const = 0;
diff --git a/be/src/vec/columns/column_string.cpp b/be/src/vec/columns/column_string.cpp
index c469d79190..12701ab5ed 100644
--- a/be/src/vec/columns/column_string.cpp
+++ b/be/src/vec/columns/column_string.cpp
@@ -164,7 +164,7 @@ ColumnPtr ColumnString::permute(const Permutation& perm, size_t limit) const {
StringRef ColumnString::serialize_value_into_arena(size_t n, Arena& arena,
char const*& begin) const {
- UInt32 string_size = size_at(n);
+ IColumn::Offset string_size = size_at(n);
size_t offset = offset_at(n);
StringRef res;
@@ -178,7 +178,7 @@ StringRef ColumnString::serialize_value_into_arena(size_t n, Arena& arena,
}
const char* ColumnString::deserialize_and_insert_from_arena(const char* pos) {
- const UInt32 string_size = unaligned_load<UInt32>(pos);
+ const IColumn::Offset string_size = unaligned_load<IColumn::Offset>(pos);
pos += sizeof(string_size);
const size_t old_size = chars.size();
diff --git a/be/src/vec/columns/column_vector.cpp b/be/src/vec/columns/column_vector.cpp
index acc7e4b9a2..dde2f033a7 100644
--- a/be/src/vec/columns/column_vector.cpp
+++ b/be/src/vec/columns/column_vector.cpp
@@ -327,7 +327,7 @@ ColumnPtr ColumnVector<T>::replicate(const IColumn::Offsets& offsets) const {
// vectorized this code to speed up
IColumn::Offset counts[size];
- for (size_t i = 0; i < size; ++i) {
+ for (ssize_t i = 0; i < size; ++i) {
counts[i] = offsets[i] - offsets[i - 1];
}
diff --git a/be/src/vec/data_types/data_type_array.cpp b/be/src/vec/data_types/data_type_array.cpp
index cc67eb7973..e39f1c569b 100644
--- a/be/src/vec/data_types/data_type_array.cpp
+++ b/be/src/vec/data_types/data_type_array.cpp
@@ -65,7 +65,7 @@ char* DataTypeArray::serialize(const IColumn& column, char* buf) const {
const auto& data_column = assert_cast<const ColumnArray&>(*ptr.get());
// row num
- *reinterpret_cast<uint32_t*>(buf) = column.size();
+ *reinterpret_cast<IColumn::Offset*>(buf) = column.size();
buf += sizeof(IColumn::Offset);
// offsets
memcpy(buf, data_column.get_offsets().data(), column.size() * sizeof(IColumn::Offset));
@@ -79,7 +79,7 @@ const char* DataTypeArray::deserialize(const char* buf, IColumn* column) const {
auto& offsets = data_column->get_offsets();
// row num
- uint32_t row_num = *reinterpret_cast<const IColumn::Offset*>(buf);
+ IColumn::Offset row_num = *reinterpret_cast<const IColumn::Offset*>(buf);
buf += sizeof(IColumn::Offset);
// offsets
offsets.resize(row_num);
diff --git a/be/src/vec/data_types/data_type_string.cpp b/be/src/vec/data_types/data_type_string.cpp
index 220b418e4b..b7bf2fdc2e 100644
--- a/be/src/vec/data_types/data_type_string.cpp
+++ b/be/src/vec/data_types/data_type_string.cpp
@@ -90,7 +90,7 @@ bool DataTypeString::equals(const IDataType& rhs) const {
int64_t DataTypeString::get_uncompressed_serialized_bytes(const IColumn& column) const {
auto ptr = column.convert_to_full_column_if_const();
const auto& data_column = assert_cast<const ColumnString&>(*ptr.get());
- return sizeof(uint32_t) * (column.size() + 1) + sizeof(uint64_t) +
+ return sizeof(IColumn::Offset) * (column.size() + 1) + sizeof(uint64_t) +
data_column.get_chars().size();
}
@@ -99,11 +99,11 @@ char* DataTypeString::serialize(const IColumn& column, char* buf) const {
const auto& data_column = assert_cast<const ColumnString&>(*ptr.get());
// row num
- *reinterpret_cast<uint32_t*>(buf) = column.size();
- buf += sizeof(uint32_t);
+ *reinterpret_cast<IColumn::Offset*>(buf) = column.size();
+ buf += sizeof(IColumn::Offset);
// offsets
- memcpy(buf, data_column.get_offsets().data(), column.size() * sizeof(uint32_t));
- buf += column.size() * sizeof(uint32_t);
+ memcpy(buf, data_column.get_offsets().data(), column.size() * sizeof(IColumn::Offset));
+ buf += column.size() * sizeof(IColumn::Offset);
// total length
uint64_t value_len = data_column.get_chars().size();
*reinterpret_cast<uint64_t*>(buf) = value_len;
@@ -121,12 +121,12 @@ const char* DataTypeString::deserialize(const char* buf, IColumn* column) const
ColumnString::Offsets& offsets = column_string->get_offsets();
// row num
- uint32_t row_num = *reinterpret_cast<const uint32_t*>(buf);
- buf += sizeof(uint32_t);
+ IColumn::Offset row_num = *reinterpret_cast<const IColumn::Offset*>(buf);
+ buf += sizeof(IColumn::Offset);
// offsets
offsets.resize(row_num);
- memcpy(offsets.data(), buf, sizeof(uint32_t) * row_num);
- buf += sizeof(uint32_t) * row_num;
+ memcpy(offsets.data(), buf, sizeof(IColumn::Offset) * row_num);
+ buf += sizeof(IColumn::Offset) * row_num;
// total length
uint64_t value_len = *reinterpret_cast<const uint64_t*>(buf);
buf += sizeof(uint64_t);
diff --git a/be/src/vec/functions/array/function_array_size.h b/be/src/vec/functions/array/function_array_size.h
index bffed4460d..1988c5b66a 100644
--- a/be/src/vec/functions/array/function_array_size.h
+++ b/be/src/vec/functions/array/function_array_size.h
@@ -59,7 +59,7 @@ public:
auto dst_column = ColumnInt64::create(input_rows_count);
auto& dst_data = dst_column->get_data();
- for (size_t i = 0; i < offsets.size(); ++i) {
+ for (ssize_t i = 0; i < offsets.size(); ++i) {
dst_data[i] = offsets[i] - offsets[i - 1];
}
diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h
index 3027c121d9..3c29cb8745 100644
--- a/be/src/vec/functions/function_string.h
+++ b/be/src/vec/functions/function_string.h
@@ -635,7 +635,7 @@ public:
//
fmt::memory_buffer buffer;
res_offsets.resize(input_row_size);
- for (size_t i = 0; i < input_row_size; ++i) {
+ for (ssize_t i = 0; i < input_row_size; ++i) {
buffer.clear();
const char* raw_str = reinterpret_cast<const char*>(&data[offsets[i - 1]]);
int size = offsets[i] - offsets[i - 1] - 1;
diff --git a/be/src/vec/sink/mysql_result_writer.cpp b/be/src/vec/sink/mysql_result_writer.cpp
index 55fa8aaf41..1902bf9972 100644
--- a/be/src/vec/sink/mysql_result_writer.cpp
+++ b/be/src/vec/sink/mysql_result_writer.cpp
@@ -111,7 +111,7 @@ Status VMysqlResultWriter::_add_one_column(const ColumnPtr& column_ptr,
} else if constexpr (type == TYPE_ARRAY) {
auto& column_array = assert_cast<const ColumnArray&>(*column);
auto& offsets = column_array.get_offsets();
- for (int i = 0; i < row_size; ++i) {
+ for (ssize_t i = 0; i < row_size; ++i) {
if (0 != buf_ret) {
return Status::InternalError("pack mysql buffer failed.");
}
@@ -128,7 +128,7 @@ Status VMysqlResultWriter::_add_one_column(const ColumnPtr& column_ptr,
_buffer.open_dynamic_mode();
buf_ret = _buffer.push_string("[", 1);
bool begin = true;
- for (int j = offsets[i - 1]; j < offsets[i]; ++j) {
+ for (auto j = offsets[i - 1]; j < offsets[i]; ++j) {
if (!begin) {
buf_ret = _buffer.push_string(", ", 2);
}
diff --git a/be/src/vec/utils/arrow_column_to_doris_column.cpp b/be/src/vec/utils/arrow_column_to_doris_column.cpp
index 206c279e4c..76741fdb59 100644
--- a/be/src/vec/utils/arrow_column_to_doris_column.cpp
+++ b/be/src/vec/utils/arrow_column_to_doris_column.cpp
@@ -100,8 +100,8 @@ static size_t fill_nullable_column(const arrow::Array* array, size_t array_idx,
/// Also internal strings are null terminated.
static Status convert_column_with_string_data(const arrow::Array* array, size_t array_idx,
MutableColumnPtr& data_column, size_t num_elements) {
- PaddedPODArray<UInt8>& column_chars_t = assert_cast<ColumnString&>(*data_column).get_chars();
- PaddedPODArray<UInt32>& column_offsets = assert_cast<ColumnString&>(*data_column).get_offsets();
+ auto& column_chars_t = assert_cast<ColumnString&>(*data_column).get_chars();
+ auto& column_offsets = assert_cast<ColumnString&>(*data_column).get_offsets();
auto concrete_array = down_cast<const arrow::BinaryArray*>(array);
std::shared_ptr<arrow::Buffer> buffer = concrete_array->value_data();
@@ -121,8 +121,8 @@ static Status convert_column_with_string_data(const arrow::Array* array, size_t
static Status convert_column_with_fixed_size_data(const arrow::Array* array, size_t array_idx,
MutableColumnPtr& data_column,
size_t num_elements) {
- PaddedPODArray<UInt8>& column_chars_t = assert_cast<ColumnString&>(*data_column).get_chars();
- PaddedPODArray<UInt32>& column_offsets = assert_cast<ColumnString&>(*data_column).get_offsets();
+ auto& column_chars_t = assert_cast<ColumnString&>(*data_column).get_chars();
+ auto& column_offsets = assert_cast<ColumnString&>(*data_column).get_offsets();
auto concrete_array = down_cast<const arrow::FixedSizeBinaryArray*>(array);
uint32_t width = concrete_array->byte_width();
diff --git a/be/test/olap/column_vector_test.cpp b/be/test/olap/column_vector_test.cpp
index 5faffb041d..19a28088d8 100644
--- a/be/test/olap/column_vector_test.cpp
+++ b/be/test/olap/column_vector_test.cpp
@@ -95,8 +95,8 @@ void test_read_write_array_column_vector(const TypeInfo* array_type_info, size_t
// first write
for (size_t i = 0; i < array_init_size; ++i) {
- uint32_t len = result[i].length();
- memcpy(offset_cvb->mutable_cell_ptr(1 + i), &len, sizeof(uint32_t));
+ uint64_t len = result[i].length();
+ memcpy(offset_cvb->mutable_cell_ptr(1 + i), &len, sizeof(uint64_t));
}
array_cvb->set_null_bits(0, array_init_size, false);
array_cvb->get_offset_by_length(0, array_init_size);
@@ -114,8 +114,8 @@ void test_read_write_array_column_vector(const TypeInfo* array_type_info, size_t
// second write
EXPECT_TRUE(array_cvb->resize(array_size).ok());
for (int i = array_init_size; i < array_size; ++i) {
- uint32_t len = result[i].length();
- memcpy(offset_cvb->mutable_cell_ptr(i + 1), &len, sizeof(uint32_t));
+ uint64_t len = result[i].length();
+ memcpy(offset_cvb->mutable_cell_ptr(i + 1), &len, sizeof(uint64_t));
}
array_cvb->set_null_bits(array_init_size, array_size - array_init_size, false);
array_cvb->get_offset_by_length(array_init_size, array_size - array_init_size);
@@ -170,7 +170,7 @@ TEST_F(ColumnVectorTest, array_column_vector_test) {
auto* item_val = new uint8_t[num_item];
memset(null_signs, 0, sizeof(bool) * 3);
- for (int i = 0; i < num_item; ++i) {
+ for (size_t i = 0; i < num_item; ++i) {
item_val[i] = i;
if (i % 3 == 0) {
size_t array_index = i / 3;
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/PrimitiveType.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/PrimitiveType.java
index 4c5c4c4c6f..32dc2bec55 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/PrimitiveType.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/PrimitiveType.java
@@ -55,7 +55,8 @@ public enum PrimitiveType {
BITMAP("BITMAP", 16, TPrimitiveType.OBJECT),
QUANTILE_STATE("QUANTILE_STATE", 16, TPrimitiveType.QUANTILE_STATE),
- ARRAY("ARRAY", 24, TPrimitiveType.ARRAY),
+ // sizeof(CollectionValue)
+ ARRAY("ARRAY", 32, TPrimitiveType.ARRAY),
MAP("MAP", 24, TPrimitiveType.MAP),
STRUCT("STRUCT", 24, TPrimitiveType.STRUCT),
STRING("STRING", 16, TPrimitiveType.STRING),
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org