You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by yi...@apache.org on 2023/07/04 01:34:16 UTC
[doris] branch master updated: [improvement](memory) improve inserting sparse rows into string column (#21420)
This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 938c0765cd [improvement](memory) improve inserting sparse rows into string column (#21420)
938c0765cd is described below
commit 938c0765cdd113ad6d5218053873491f43dcd9db
Author: TengJianPing <18...@users.noreply.github.com>
AuthorDate: Tue Jul 4 09:34:10 2023 +0800
[improvement](memory) improve inserting sparse rows into string column (#21420)
For the following test, which simulates a hash join outputting 435699854 rows from 5131 build-side rows:
{
auto col = doris::vectorized::ColumnString::create();
constexpr int build_rows = 5131;
constexpr int output_rows = 435699854;
std::string str("01234567");
for (int i = 0; i < build_rows; ++i) {
col->insert_data(str.data(), str.size());
}
int indices[output_rows];
for (int i = 0; i < output_rows; ++i) {
indices[i] = i % build_rows;
}
auto col2 = doris::vectorized::ColumnString::create();
doris::MonotonicStopWatch watch;
watch.start();
col2->insert_indices_from(*col, indices, indices + output_rows);
watch.stop();
LOG(WARNING) << "string column insert_indices_from, rows: " << output_rows << ", time: " << doris::PrettyPrinter::print(watch.elapsed_time(), doris::TUnit::TIME_NS);
}
The ColumnString::insert_indices_from insertion time improves from 6s665ms to 3s158ms:
W0702 23:08:39.672044 1277989 doris_main.cpp:545] string column insert_indices_from, rows: 435699854, time: 3s153ms
W0702 23:09:36.368853 1282061 doris_main.cpp:545] string column insert_indices_from, rows: 435699854, time: 3s158ms
W0703 00:30:26.093307 1468640 doris_main.cpp:545] string column insert_indices_from, rows: 435699854, time: 6s761ms
W0703 00:31:21.043638 1472937 doris_main.cpp:545] string column insert_indices_from, rows: 435699854, time: 6s665ms
---
be/src/vec/columns/column_decimal.h | 3 ---
be/src/vec/columns/column_string.cpp | 37 +++++++++++++++++++++++++++++-------
be/src/vec/columns/column_string.h | 5 -----
be/src/vec/columns/column_vector.cpp | 6 ------
4 files changed, 30 insertions(+), 21 deletions(-)
diff --git a/be/src/vec/columns/column_decimal.h b/be/src/vec/columns/column_decimal.h
index 973f0bea68..0aa0879f6b 100644
--- a/be/src/vec/columns/column_decimal.h
+++ b/be/src/vec/columns/column_decimal.h
@@ -129,9 +129,6 @@ public:
const T* src_data = reinterpret_cast<const T*>(src.get_raw_data().data);
for (int i = 0; i < new_size; ++i) {
- if (i + IColumn::PREFETCH_STEP < new_size) {
- __builtin_prefetch(&src_data[indices_begin[i + IColumn::PREFETCH_STEP]], 0, 1);
- }
data[origin_size + i] = src_data[indices_begin[i]];
}
}
diff --git a/be/src/vec/columns/column_string.cpp b/be/src/vec/columns/column_string.cpp
index e5bab2899f..3277e62894 100644
--- a/be/src/vec/columns/column_string.cpp
+++ b/be/src/vec/columns/column_string.cpp
@@ -109,14 +109,37 @@ void ColumnString::insert_range_from(const IColumn& src, size_t start, size_t le
void ColumnString::insert_indices_from(const IColumn& src, const int* indices_begin,
const int* indices_end) {
+ const ColumnString& src_str = assert_cast<const ColumnString&>(src);
+ auto src_offset_data = src_str.offsets.data();
+
+ auto old_char_size = chars.size();
+ size_t total_chars_size = old_char_size;
+
+ auto dst_offsets_pos = offsets.size();
+ offsets.resize(offsets.size() + indices_end - indices_begin);
+ auto* dst_offsets_data = offsets.data();
+
for (auto x = indices_begin; x != indices_end; ++x) {
- if (*x == -1) {
- ColumnString::insert_default();
- } else {
- if (x + IColumn::PREFETCH_STEP < indices_end && (-1 != x[IColumn::PREFETCH_STEP])) {
- ColumnString::prefetch(src, *(x + IColumn::PREFETCH_STEP));
- }
- ColumnString::insert_from(src, *x);
+ if (*x != -1) {
+ total_chars_size += src_offset_data[*x] - src_offset_data[*x - 1];
+ }
+ dst_offsets_data[dst_offsets_pos++] = total_chars_size;
+ }
+ check_chars_length(total_chars_size, offsets.size());
+
+ chars.resize(total_chars_size);
+
+ auto* src_data_ptr = src_str.chars.data();
+ auto* dst_data_ptr = chars.data();
+
+ size_t dst_chars_pos = old_char_size;
+ for (auto x = indices_begin; x != indices_end; ++x) {
+ if (*x != -1) {
+ const size_t size_to_append = src_offset_data[*x] - src_offset_data[*x - 1];
+ const size_t offset = src_offset_data[*x - 1];
+ memcpy_small_allow_read_write_overflow15(dst_data_ptr + dst_chars_pos,
+ src_data_ptr + offset, size_to_append);
+ dst_chars_pos += size_to_append;
}
}
}
diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h
index 703826cd24..a8bf06c469 100644
--- a/be/src/vec/columns/column_string.h
+++ b/be/src/vec/columns/column_string.h
@@ -150,11 +150,6 @@ public:
offsets.push_back(new_size);
}
- void prefetch(const IColumn& src_, size_t n) {
- const ColumnString& src = assert_cast<const ColumnString&>(src_);
- __builtin_prefetch(&src.chars[src.offsets[n - 1]], 0, 1);
- }
-
void insert_from(const IColumn& src_, size_t n) override {
const ColumnString& src = assert_cast<const ColumnString&>(src_);
const size_t size_to_append =
diff --git a/be/src/vec/columns/column_vector.cpp b/be/src/vec/columns/column_vector.cpp
index 35534d95e7..388b436bc5 100644
--- a/be/src/vec/columns/column_vector.cpp
+++ b/be/src/vec/columns/column_vector.cpp
@@ -376,18 +376,12 @@ void ColumnVector<T>::insert_indices_from(const IColumn& src, const int* indices
if constexpr (std::is_same_v<T, UInt8>) {
// nullmap : indices_begin[i] == -1 means is null at the here, set true here
for (int i = 0; i < new_size; ++i) {
- if (i + IColumn::PREFETCH_STEP < new_size) {
- __builtin_prefetch(&src_data[indices_begin[i + IColumn::PREFETCH_STEP]], 0, 1);
- }
data[origin_size + i] = (indices_begin[i] == -1) +
(indices_begin[i] != -1) * src_data[indices_begin[i]];
}
} else {
// real data : indices_begin[i] == -1 what at is meaningless
for (int i = 0; i < new_size; ++i) {
- if (i + IColumn::PREFETCH_STEP < new_size) {
- __builtin_prefetch(&src_data[indices_begin[i + IColumn::PREFETCH_STEP]], 0, 1);
- }
data[origin_size + i] = src_data[indices_begin[i]];
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org