You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by GitBox <gi...@apache.org> on 2022/12/15 08:45:54 UTC

[GitHub] [doris] zhannngchen commented on a diff in pull request #14595: [enhancement](load) multipass segcompaction to reduce segments before publish (#14590)

zhannngchen commented on code in PR #14595:
URL: https://github.com/apache/doris/pull/14595#discussion_r1049280771


##########
be/src/common/config.h:
##########
@@ -838,6 +838,9 @@ CONF_Int32(segcompaction_threshold_segment_num, "10");
 // The segment whose row number above the threshold will be compacted during segcompaction
 CONF_Int32(segcompaction_small_threshold, "1048576");
 
+// Enable final-pass segcompaction before commit txn
+CONF_Bool(enable_final_pass_segcompaction, "false");

Review Comment:
   We can make it as the default behavior os segment compaction?



##########
be/src/olap/rowset/beta_rowset_writer.cpp:
##########
@@ -552,6 +575,58 @@ Status BetaRowsetWriter::_segcompaction_ramaining_if_necessary() {
     return status;
 }
 
+Status BetaRowsetWriter::_segcompaction_finalpass_wait() {
+    Status status = Status::OK();
+    DCHECK_EQ(_is_doing_segcompaction, false);
+    if (!config::enable_storage_vectorization) {
+        return Status::OK();
+    }
+    if (_segcompaction_status.load() != OLAP_SUCCESS) {
+        return Status::OLAPInternalError(OLAP_ERR_SEGCOMPACTION_FAILED);
+    }
+
+    int64_t num_seg = _is_segcompacted() ? _num_segcompacted : _num_segment;
+    if (num_seg <= FINAL_SEGCOMPACTION_NUM_THRESHOLD) {
+        LOG(INFO) << "num_seg less than max_segment_num_per_rowset, skip final pass: " << num_seg
+                  << " vs " << config::max_segment_num_per_rowset;
+        return Status::OK();
+    }
+
+    // reset num counters for the final segcompaction
+    _num_segment = _num_segcompacted.load();
+    _num_segcompacted = 0;
+    _segcompacted_point = 0;
+
+    _total_row_num_written = 0;
+    {
+        std::lock_guard<std::mutex> lock(_segid_statistics_map_mutex);
+        for (const auto& itr : _segid_statistics_map) {
+            _total_row_num_written += itr.second.row_num;
+        }
+    }
+
+    while (_segcompacted_point != _num_segment) {
+        _is_doing_segcompaction = true;

Review Comment:
   no lock protection here?



##########
be/src/olap/rowset/beta_rowset_writer.cpp:
##########
@@ -552,6 +575,58 @@ Status BetaRowsetWriter::_segcompaction_ramaining_if_necessary() {
     return status;
 }
 
+Status BetaRowsetWriter::_segcompaction_finalpass_wait() {
+    Status status = Status::OK();
+    DCHECK_EQ(_is_doing_segcompaction, false);
+    if (!config::enable_storage_vectorization) {
+        return Status::OK();
+    }
+    if (_segcompaction_status.load() != OLAP_SUCCESS) {
+        return Status::OLAPInternalError(OLAP_ERR_SEGCOMPACTION_FAILED);
+    }
+
+    int64_t num_seg = _is_segcompacted() ? _num_segcompacted : _num_segment;
+    if (num_seg <= FINAL_SEGCOMPACTION_NUM_THRESHOLD) {
+        LOG(INFO) << "num_seg less than max_segment_num_per_rowset, skip final pass: " << num_seg

Review Comment:
   How about the case that the data size is quite big, and `total_size/MAX_SEGMENT_SIZE > FINAL_SEGCOMPACTION_NUM_THRESHOLD`?



##########
be/src/olap/rowset/beta_rowset_writer.cpp:
##########
@@ -400,19 +407,22 @@ Status BetaRowsetWriter::_load_noncompacted_segments(
             return Status::OLAPInternalError(OLAP_ERR_ROWSET_LOAD_FAILED);
         }
         segments->push_back(std::move(segment));
+        if (++segment_num >= FINAL_SEGCOMPACTION_MAX_SEGMENTS) {
+            break;
+        }
     }
     return Status::OK();
 }
 
 /* policy of segcompaction target selection:
- *  1. skip big segments
+ *  1. skip (and rename to keep segment id sequential when necessary) big segments
  *  2. if the consecutive smalls end up with a big, compact the smalls, except
  *     single small
  *  3. if the consecutive smalls end up with small, compact the smalls if the
  *     length is beyond (config::segcompaction_threshold_segment_num / 2)
  */
 Status BetaRowsetWriter::_find_longest_consecutive_small_segment(
-        SegCompactionCandidatesSharedPtr segments) {
+        SegCompactionCandidatesSharedPtr segments, size_t small_threshold, bool is_greedy) {

Review Comment:
   Add a comment to explain new parameter `is _greedy`



##########
be/src/olap/rowset/beta_rowset_writer.h:
##########
@@ -127,6 +144,12 @@ class BetaRowsetWriter : public RowsetWriter {
     bool _is_segment_overlapping(const std::vector<KeyBoundsPB>& segments_encoded_key_bounds);
 
 protected:
+    // segment num below will not trigger final segcompaction
+    const uint32_t FINAL_SEGCOMPACTION_NUM_THRESHOLD = 50;

Review Comment:
   I think we can merge this 2 const variables to 1



##########
be/src/olap/rowset/beta_rowset_writer.h:
##########
@@ -32,6 +32,18 @@ class FileWriter;
 using SegCompactionCandidates = std::vector<segment_v2::SegmentSharedPtr>;
 using SegCompactionCandidatesSharedPtr = std::shared_ptr<SegCompactionCandidates>;
 
+/* A load process can be divided into: 1) Loading 2) Publishing (Committing txn)

Review Comment:
   Publishing is not equal to committing, they are different phase



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org