You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by la...@apache.org on 2023/01/17 04:33:58 UTC

[kudu] branch master updated: [tools] Return immediately after the limit number of rows have been dumped

This is an automated email from the ASF dual-hosted git repository.

laiyingchun pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/master by this push:
     new a1f13f934 [tools] Return immediately after the limit number of rows have been dumped
a1f13f934 is described below

commit a1f13f93413e23674b11b9a308e69c95ef6ce97c
Author: Yingchun Lai <la...@apache.org>
AuthorDate: Sat Dec 17 16:55:51 2022 +0800

    [tools] Return immediately after the limit number of rows have been dumped
    
    The --nrows flag for 'local_replica dump rowset'
    just limits the last output row count, but doesn't
    take effect on the row count dumped from rowsets
    internally of the tool, due to which there maybe
    too much useless work.
    
    This patch improves it by returning immediately
    after --nrows of rows have been dumped..
    
    Change-Id: Ia758ba910fccbbc06ac6c59a795574fb86d4e279
    Reviewed-on: http://gerrit.cloudera.org:8080/19369
    Tested-by: Kudu Jenkins
    Reviewed-by: Wang Xixu <14...@qq.com>
    Reviewed-by: Alexey Serbin <al...@apache.org>
---
 src/kudu/tablet/compaction-test.cc          |  4 +--
 src/kudu/tablet/compaction.cc               |  8 +++++-
 src/kudu/tablet/compaction.h                |  8 ++++--
 src/kudu/tablet/diskrowset.cc               |  4 +--
 src/kudu/tablet/diskrowset.h                | 21 ++++++++++++--
 src/kudu/tablet/memrowset.cc                |  5 +++-
 src/kudu/tablet/memrowset.h                 |  9 ++----
 src/kudu/tablet/mock-rowsets.h              |  2 +-
 src/kudu/tablet/rowset.cc                   | 10 +++++--
 src/kudu/tablet/rowset.h                    | 17 +++++++----
 src/kudu/tools/kudu-tool-test.cc            | 21 ++++++++++++++
 src/kudu/tools/tool_action_local_replica.cc | 44 ++++++++++++++++++-----------
 12 files changed, 111 insertions(+), 42 deletions(-)

diff --git a/src/kudu/tablet/compaction-test.cc b/src/kudu/tablet/compaction-test.cc
index d00960f67..e23b3083d 100644
--- a/src/kudu/tablet/compaction-test.cc
+++ b/src/kudu/tablet/compaction-test.cc
@@ -301,8 +301,8 @@ class TestCompaction : public KuduRowSetTest {
 
   // Iterate over the given compaction input, stringifying and dumping each
   // yielded row to *out
-  void IterateInput(CompactionInput* input, vector<string>* out) {
-    ASSERT_OK(DebugDumpCompactionInput(input, out));
+  static void IterateInput(CompactionInput* input, vector<string>* out) {
+    ASSERT_OK(DebugDumpCompactionInput(input, nullptr, out));
   }
 
   // Flush the given CompactionInput 'input' to disk with the given snapshot.
diff --git a/src/kudu/tablet/compaction.cc b/src/kudu/tablet/compaction.cc
index 61c39c59d..b6a31e86c 100644
--- a/src/kudu/tablet/compaction.cc
+++ b/src/kudu/tablet/compaction.cc
@@ -1639,14 +1639,20 @@ Status ReupdateMissedDeltas(const IOContext* io_context,
 }
 
 
-Status DebugDumpCompactionInput(CompactionInput* input, vector<string>* lines) {
+Status DebugDumpCompactionInput(CompactionInput* input, int64_t* rows_left, vector<string>* lines) {
   RETURN_NOT_OK(input->Init());
   vector<CompactionInputRow> rows;
 
   while (input->HasMoreBlocks()) {
+    if (rows_left && *rows_left <= 0) {
+      break;
+    }
     RETURN_NOT_OK(input->PrepareBlock(&rows));
 
     for (const auto& input_row : rows) {
+      if (rows_left && (*rows_left)-- <= 0) {
+        break;
+      }
       LOG_STRING(INFO, lines) << CompactionInputRowToString(input_row);
     }
 
diff --git a/src/kudu/tablet/compaction.h b/src/kudu/tablet/compaction.h
index 4ea0f791a..967fc70b9 100644
--- a/src/kudu/tablet/compaction.h
+++ b/src/kudu/tablet/compaction.h
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <cstddef>
+#include <cstdint>
 #include <memory>
 #include <mutex>
 #include <string>
@@ -256,8 +257,11 @@ Status ReupdateMissedDeltas(const fs::IOContext* io_context,
                             const RowSetVector& output_rowsets);
 
 // Dump the given compaction input to 'lines' or LOG(INFO) if it is NULL.
-// This consumes all of the input in the compaction input.
-Status DebugDumpCompactionInput(CompactionInput* input, std::vector<std::string>* lines);
+// This consumes no more rows from the compaction input than specified by the 'rows_left' parameter.
+// If 'rows_left' is nullptr, there is no limit on the number of rows to dump.
+// If the content of 'rows_left' is equal to or less than 0, no rows will be dumped.
+Status DebugDumpCompactionInput(CompactionInput* input, int64_t* rows_left,
+                                std::vector<std::string>* lines);
 
 // Helper methods to print a row with full history.
 std::string RowToString(const RowBlockRow& row,
diff --git a/src/kudu/tablet/diskrowset.cc b/src/kudu/tablet/diskrowset.cc
index f4756ce7f..c9d03a2a9 100644
--- a/src/kudu/tablet/diskrowset.cc
+++ b/src/kudu/tablet/diskrowset.cc
@@ -905,14 +905,14 @@ Status DiskRowSet::DeleteAncientUndoDeltas(Timestamp ancient_history_mark,
                                                  blocks_deleted, bytes_deleted);
 }
 
-Status DiskRowSet::DebugDump(vector<string> *lines) {
+Status DiskRowSet::DebugDumpImpl(int64_t* rows_left, vector<string>* lines) {
   // Using CompactionInput to dump our data is an easy way of seeing all the
   // rows and deltas.
   unique_ptr<CompactionInput> input;
   RETURN_NOT_OK(NewCompactionInput(rowset_metadata_->tablet_schema().get(),
                                    MvccSnapshot::CreateSnapshotIncludingAllOps(),
                                    nullptr, &input));
-  return DebugDumpCompactionInput(input.get(), lines);
+  return DebugDumpCompactionInput(input.get(), rows_left, lines);
 }
 
 } // namespace tablet
diff --git a/src/kudu/tablet/diskrowset.h b/src/kudu/tablet/diskrowset.h
index 4ebb848b8..228311bb9 100644
--- a/src/kudu/tablet/diskrowset.h
+++ b/src/kudu/tablet/diskrowset.h
@@ -57,6 +57,10 @@ class RowChangeList;
 class RowwiseIterator;
 class Timestamp;
 
+namespace tablet {
+class RowSetMetadata;
+}  // namespace tablet
+
 namespace cfile {
 class BloomFileWriter;
 class CFileWriter;
@@ -75,6 +79,13 @@ namespace log {
 class LogAnchorRegistry;
 }
 
+namespace tools {
+Status DumpRowSetInternal(const fs::IOContext& ctx,
+                          const std::shared_ptr<tablet::RowSetMetadata>& rs_meta,
+                          int indent,
+                          int64_t* rows_left);
+}
+
 namespace tablet {
 
 class CFileSet;
@@ -86,7 +97,6 @@ class MultiColumnWriter;
 class Mutation;
 class MvccSnapshot;
 class OperationResultPB;
-class RowSetMetadata;
 
 class DiskRowSetWriter {
  public:
@@ -456,8 +466,6 @@ class DiskRowSet :
         ToString());
   }
 
-  virtual Status DebugDump(std::vector<std::string> *lines) override;
-
  protected:
   DiskRowSet(std::shared_ptr<RowSetMetadata> rowset_metadata,
              log::LogAnchorRegistry* log_anchor_registry,
@@ -471,6 +479,11 @@ class DiskRowSet :
 
   friend class CompactionInput;
   friend class Tablet;
+  friend Status kudu::tools::DumpRowSetInternal(
+      const fs::IOContext& ctx,
+      const std::shared_ptr<tablet::RowSetMetadata>& rs_meta,
+      int indent,
+      int64_t* rows_left);
 
   Status Open(const fs::IOContext* io_context);
 
@@ -485,6 +498,8 @@ class DiskRowSet :
                                               const fs::IOContext* io_context,
                                               HistoryGcOpts history_gc_opts);
 
+  Status DebugDumpImpl(int64_t* rows_left, std::vector<std::string>* lines) override;
+
   std::shared_ptr<RowSetMetadata> rowset_metadata_;
 
   bool open_;
diff --git a/src/kudu/tablet/memrowset.cc b/src/kudu/tablet/memrowset.cc
index ea022ae2e..854375a6a 100644
--- a/src/kudu/tablet/memrowset.cc
+++ b/src/kudu/tablet/memrowset.cc
@@ -148,10 +148,13 @@ MemRowSet::MemRowSet(int64_t id,
 MemRowSet::~MemRowSet() {
 }
 
-Status MemRowSet::DebugDump(vector<string> *lines) {
+Status MemRowSet::DebugDumpImpl(int64_t* rows_left, vector<string>* lines) {
   unique_ptr<Iterator> iter(NewIterator());
   RETURN_NOT_OK(iter->Init(nullptr));
   while (iter->HasNext()) {
+    if (rows_left && (*rows_left)-- <= 0) {
+      break;
+    }
     MRSRow row = iter->GetCurrentRow();
     LOG_STRING(INFO, lines)
       << "@" << row.insertion_timestamp()
diff --git a/src/kudu/tablet/memrowset.h b/src/kudu/tablet/memrowset.h
index 4aa2c052f..291ca41d8 100644
--- a/src/kudu/tablet/memrowset.h
+++ b/src/kudu/tablet/memrowset.h
@@ -24,6 +24,7 @@
 #include <optional>
 #include <ostream>
 #include <string>
+#include <type_traits>
 #include <vector>
 
 #include <glog/logging.h>
@@ -377,12 +378,6 @@ class MemRowSet : public RowSet,
         reinterpret_cast<RowSetMetadata *>(NULL));
   }
 
-  // Dump the contents of the memrowset to the given vector.
-  // If 'lines' is NULL, dumps to LOG(INFO).
-  //
-  // This dumps every row, so should only be used in tests, etc.
-  virtual Status DebugDump(std::vector<std::string> *lines) override;
-
   std::string ToString() const override {
     return strings::Substitute("memrowset$0",
         txn_id_ ? strings::Substitute("(txn_id=$0)", *txn_id_) : "");
@@ -471,6 +466,8 @@ class MemRowSet : public RowSet,
                   const ConstContiguousRow& row,
                   MRSRow *ms_row);
 
+  Status DebugDumpImpl(int64_t* rows_left, std::vector<std::string>* lines) override;
+
   typedef btree::CBTree<MSBTreeTraits> MSBTree;
 
   int64_t id_;
diff --git a/src/kudu/tablet/mock-rowsets.h b/src/kudu/tablet/mock-rowsets.h
index a0ae3416c..701d2f8db 100644
--- a/src/kudu/tablet/mock-rowsets.h
+++ b/src/kudu/tablet/mock-rowsets.h
@@ -77,7 +77,7 @@ class MockRowSet : public RowSet {
     LOG(FATAL) << "Unimplemented";
     return "";
   }
-  Status DebugDump(std::vector<std::string>* /*lines*/) override {
+  Status DebugDumpImpl(int64_t* /*rows_left*/, std::vector<std::string>* /*lines*/) override {
     LOG(FATAL) << "Unimplemented";
     return Status::OK();
   }
diff --git a/src/kudu/tablet/rowset.cc b/src/kudu/tablet/rowset.cc
index 11b70989c..0fd72b78c 100644
--- a/src/kudu/tablet/rowset.cc
+++ b/src/kudu/tablet/rowset.cc
@@ -49,6 +49,10 @@ RowIteratorOptions::RowIteratorOptions()
       order(OrderMode::UNORDERED),
       include_deleted_rows(false) {}
 
+Status RowSet::DebugDump(std::vector<std::string>* lines) {
+  return DebugDumpImpl(nullptr /* rows_left */, lines);
+}
+
 Status RowSet::NewRowIteratorWithBounds(const RowIteratorOptions& opts,
                                         IterWithBounds* out) const {
   // Get the iterator.
@@ -288,19 +292,19 @@ shared_ptr<RowSetMetadata> DuplicatingRowSet::metadata() {
   return shared_ptr<RowSetMetadata>(reinterpret_cast<RowSetMetadata *>(NULL));
 }
 
-Status DuplicatingRowSet::DebugDump(vector<string> *lines) { // NOLINT(*)
+Status DuplicatingRowSet::DebugDumpImpl(int64_t* rows_left, vector<string>* lines) { // NOLINT(*)
   int i = 1;
   for (const shared_ptr<RowSet> &rs : old_rowsets_) {
     LOG_STRING(INFO, lines) << "Duplicating rowset input " << ToString() << " "
                             << i << "/" << old_rowsets_.size() << ":";
-    RETURN_NOT_OK(rs->DebugDump(lines));
+    RETURN_NOT_OK(rs->DebugDumpImpl(rows_left, lines));
     i++;
   }
   i = 1;
   for (const shared_ptr<RowSet> &rs : new_rowsets_) {
     LOG_STRING(INFO, lines) << "Duplicating rowset output " << ToString() << " "
                             << i << "/" << new_rowsets_.size() << ":";
-    RETURN_NOT_OK(rs->DebugDump(lines));
+    RETURN_NOT_OK(rs->DebugDumpImpl(rows_left, lines));
     i++;
   }
 
diff --git a/src/kudu/tablet/rowset.h b/src/kudu/tablet/rowset.h
index f759cb584..e04f8a686 100644
--- a/src/kudu/tablet/rowset.h
+++ b/src/kudu/tablet/rowset.h
@@ -175,9 +175,10 @@ class RowSet {
   // Return a displayable string for this rowset.
   virtual std::string ToString() const = 0;
 
-  // Dump the full contents of this rowset, for debugging.
-  // This is very verbose so only useful within unit tests.
-  virtual Status DebugDump(std::vector<std::string> *lines = nullptr) = 0;
+  // Dump the full contents of this rowset to the given vector, for debugging.
+  // This is very verbose so only useful within unit tests or CLI tools.
+  // If 'lines' is nullptr, dumps to LOG(INFO).
+  Status DebugDump(std::vector<std::string>* lines);
 
   // Return the size of this rowset on disk, in bytes.
   virtual uint64_t OnDiskSize() const = 0;
@@ -306,6 +307,12 @@ class RowSet {
   // Set after a compaction has completed to indicate that the rowset has been
   // removed from the rowset tree and is thus longer available for compaction.
   virtual void set_has_been_compacted() = 0;
+
+  // Similar to DebugDump, but adds an extra 'rows_left' parameter. DebugDump invokes
+  // DebugDumpImpl with 'rows_left' as nullptr.
+  // If 'rows_left' is nullptr, there is no limit on the number of rows to dump.
+  // If the content of 'rows_left' equal to or less than 0, no rows will be dumped.
+  virtual Status DebugDumpImpl(int64_t* rows_left, std::vector<std::string>* lines) = 0;
 };
 
 // Used often enough, may as well typedef it.
@@ -426,8 +433,6 @@ class DuplicatingRowSet : public RowSet {
 
   std::string ToString() const override;
 
-  virtual Status DebugDump(std::vector<std::string> *lines = nullptr) override;
-
   std::shared_ptr<RowSetMetadata> metadata() override;
 
   // A flush-in-progress rowset should never be selected for compaction.
@@ -507,6 +512,8 @@ class DuplicatingRowSet : public RowSet {
       const fs::IOContext* /*io_context*/) override { return Status::OK(); }
 
  private:
+  Status DebugDumpImpl(int64_t* rows_left, std::vector<std::string>* lines) override;
+
   friend class Tablet;
 
   DISALLOW_COPY_AND_ASSIGN(DuplicatingRowSet);
diff --git a/src/kudu/tools/kudu-tool-test.cc b/src/kudu/tools/kudu-tool-test.cc
index 21eb85101..b6a5694e6 100644
--- a/src/kudu/tools/kudu-tool-test.cc
+++ b/src/kudu/tools/kudu-tool-test.cc
@@ -2933,6 +2933,7 @@ TEST_F(ToolTest, TestLocalReplicaOps) {
   {
     // Dump rowsets' primary key bounds only.
     string stdout;
+    // Unset --dump_all_columns.
     NO_FATALS(RunActionStdoutString(
         Substitute("local_replica dump rowset --nodump_all_columns "
                    "--nodump_metadata --use_readable_format "
@@ -2952,6 +2953,26 @@ TEST_F(ToolTest, TestLocalReplicaOps) {
         ASSERT_STR_NOT_CONTAINS(stdout, row_key);
       }
     }
+
+    // Set --dump_all_columns.
+    NO_FATALS(RunActionStdoutString(
+        Substitute("local_replica dump rowset --dump_all_columns "
+                   "--nodump_metadata --nrows=15 $0 $1 $2",
+                   kTestTablet, fs_paths, encryption_args), &stdout));
+
+    SCOPED_TRACE(stdout);
+    ASSERT_STR_CONTAINS(stdout, "Dumping rowset 0");
+    ASSERT_STR_CONTAINS(stdout, "Dumping rowset 1");
+    ASSERT_STR_CONTAINS(stdout, "Dumping rowset 2");
+    ASSERT_STR_NOT_CONTAINS(stdout, "RowSet metadata");
+    for (int row_idx = 0; row_idx < 30; row_idx++) {
+      string row_prefix = Substitute("Base: (int32 key=$0", row_idx);
+      if (row_idx < 15) {
+        ASSERT_STR_CONTAINS(stdout, row_prefix);
+      } else {
+        ASSERT_STR_NOT_CONTAINS(stdout, row_prefix);
+      }
+    }
   }
   {
     TabletMetadata* meta = harness.tablet()->metadata();
diff --git a/src/kudu/tools/tool_action_local_replica.cc b/src/kudu/tools/tool_action_local_replica.cc
index e8ec825af..65b02ab83 100644
--- a/src/kudu/tools/tool_action_local_replica.cc
+++ b/src/kudu/tools/tool_action_local_replica.cc
@@ -20,6 +20,7 @@
 #include <cstdint>
 #include <functional>
 #include <iostream>
+#include <limits>
 #include <map>
 #include <memory>
 #include <mutex>
@@ -1033,14 +1034,20 @@ Status DumpRowSetInternal(const IOContext& ctx,
                           const shared_ptr<RowSetMetadata>& rs_meta,
                           int indent,
                           int64_t* rows_left) {
-  tablet::RowSetDataPB pb;
-  rs_meta->ToProtobuf(&pb);
-
+  // When the --dump_metadata flag is set to 'true', the metadata is always printed regardless of
+  // the number of rows to print.
   if (FLAGS_dump_metadata) {
+    tablet::RowSetDataPB pb;
+    rs_meta->ToProtobuf(&pb);
     cout << Indent(indent) << "RowSet metadata: " << pb_util::SecureDebugString(pb)
          << endl << endl;
   }
 
+  CHECK(rows_left);
+  if (*rows_left <= 0) {
+    return Status::OK();
+  }
+
   scoped_refptr<log::LogAnchorRegistry> log_reg(new log::LogAnchorRegistry());
   shared_ptr<DiskRowSet> rs;
   RETURN_NOT_OK(DiskRowSet::Open(rs_meta,
@@ -1050,7 +1057,7 @@ Status DumpRowSetInternal(const IOContext& ctx,
                                  &rs));
   vector<string> lines;
   if (FLAGS_dump_all_columns) {
-    RETURN_NOT_OK(rs->DebugDump(&lines));
+    RETURN_NOT_OK(rs->DebugDumpImpl(rows_left, &lines));
   } else {
     Schema key_proj = rs_meta->tablet_schema()->CreateKeyProjection();
     RowIteratorOptions opts;
@@ -1076,26 +1083,30 @@ Status DumpRowSetInternal(const IOContext& ctx,
         current_upper_bound = DumpRow(key_proj, block.row(block.nrows() - 1), &key);
       } else {
         for (int i = 0; i < block.nrows(); i++) {
+          if ((*rows_left)-- <= 0) {
+            break;
+          }
           lines.emplace_back(DumpRow(key_proj, block.row(i), &key));
         }
+        if (*rows_left <= 0) {
+          // There is no need to read the next block when the row limit has been reached.
+          break;
+        }
       }
     }
     if (FLAGS_dump_primary_key_bounds_only) {
+      // As long as 'rows_left' is greater than 0, lower and upper bounds are always printed in
+      // pairs.
+      (*rows_left) -= 2;
       lines.emplace_back(lower_bound);
       lines.emplace_back(current_upper_bound);
     }
   }
 
-  // Respect 'rows_left' when dumping the output.
-  int64_t limit = *rows_left >= 0 ?
-                  std::min<int64_t>(*rows_left, lines.size()) : lines.size();
-  for (int i = 0; i < limit; i++) {
-    cout << lines[i] << endl;
+  for (const auto& line : lines) {
+    cout << line << endl;
   }
 
-  if (*rows_left >= 0) {
-    *rows_left -= limit;
-  }
   return Status::OK();
 }
 
@@ -1115,7 +1126,7 @@ Status DumpRowSet(const RunnerContext& context) {
 
   IOContext ctx;
   ctx.tablet_id = meta->tablet_id();
-  int64_t rows_left = FLAGS_nrows;
+  int64_t rows_left = FLAGS_nrows < 0 ? std::numeric_limits<int64_t>::max() : FLAGS_nrows;
 
   // If rowset index is provided, only dump that rowset.
   if (FLAGS_rowset_index != -1) {
@@ -1130,9 +1141,8 @@ Status DumpRowSet(const RunnerContext& context) {
   }
 
   // Rowset index not provided, dump all rowsets
-  size_t idx = 0;
-  for (const auto& rs_meta : meta->rowsets())  {
-    cout << endl << "Dumping rowset " << idx++ << endl << kSeparatorLine;
+  for (const auto& rs_meta : meta->rowsets()) {
+    cout << endl << "Dumping rowset " << rs_meta->id() << endl << kSeparatorLine;
     RETURN_NOT_OK(DumpRowSetInternal(ctx, rs_meta, kIndent, &rows_left));
   }
   return Status::OK();
@@ -1196,11 +1206,13 @@ unique_ptr<Mode> BuildDumpMode() {
       .AddRequiredParameter({ kTabletIdArg, kTabletIdArgDesc })
       .AddOptionalParameter("dump_all_columns")
       .AddOptionalParameter("dump_metadata")
+      .AddOptionalParameter("dump_primary_key_bounds_only")
       .AddOptionalParameter("fs_data_dirs")
       .AddOptionalParameter("fs_metadata_dir")
       .AddOptionalParameter("fs_wal_dir")
       .AddOptionalParameter("nrows")
       .AddOptionalParameter("rowset_index")
+      .AddOptionalParameter("use_readable_format")
       .Build();
 
   unique_ptr<Action> dump_wals =