You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by mo...@apache.org on 2023/06/26 03:37:20 UTC
[doris] branch master updated: [improvement](fs_bench) optimize the usage of fs benchmark tool for hdfs (#21154)
This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 1dec592e91 [improvement](fs_bench) optimize the usage of fs benchmark tool for hdfs (#21154)
1dec592e91 is described below
commit 1dec592e917905ab6c4a253fee54c2f22d8fdaf9
Author: Mingyu Chen <mo...@163.com>
AuthorDate: Mon Jun 26 11:37:14 2023 +0800
[improvement](fs_bench) optimize the usage of fs benchmark tool for hdfs (#21154)
Optimize the usage of fs benchmark tool:
1. Remove `Open` benchmark, it is useless.
2. Remove `Delete` benchmark, it is dangerous.
3. Add `SingleRead` benchmark, which lets the user specify an existing file to test the read operation:
`sh bin/run-fs-benchmark.sh --conf=conf/hdfs_read.conf --fs_type=hdfs --operation=single_read`
4. Modify `run-fs-benchmark.sh`: remove the `OPTS` section and pass the command-line options to `fs_benchmark_tool` directly
5. Add some custom counters to the benchmark result, e.g.:
```
--------------------------------------------------------------------------------------------------------------------------------
Benchmark Time CPU Iterations UserCounters...
--------------------------------------------------------------------------------------------------------------------------------
HdfsReadBenchmark/iterations:1/repeats:3/manual_time/threads:1 6864 ms 2385 ms 1 ReadRate=200.936M/s
HdfsReadBenchmark/iterations:1/repeats:3/manual_time/threads:1 3919 ms 1828 ms 1 ReadRate=351.96M/s
HdfsReadBenchmark/iterations:1/repeats:3/manual_time/threads:1 3839 ms 1819 ms 1 ReadRate=359.265M/s
HdfsReadBenchmark/iterations:1/repeats:3/manual_time/threads:1_mean 4874 ms 2011 ms 3 ReadRate=304.054M/s
HdfsReadBenchmark/iterations:1/repeats:3/manual_time/threads:1_median 3919 ms 1828 ms 3 ReadRate=351.96M/s
HdfsReadBenchmark/iterations:1/repeats:3/manual_time/threads:1_stddev 1724 ms 324 ms 3 ReadRate=89.3768M/s
HdfsReadBenchmark/iterations:1/repeats:3/manual_time/threads:1_cv 35.37 % 16.11 % 3 ReadRate=29.40%
HdfsReadBenchmark/iterations:1/repeats:3/manual_time/threads:1_max 6864 ms 2385 ms 3 ReadRate=359.265M/s
HdfsReadBenchmark/iterations:1/repeats:3/manual_time/threads:1_min 3839 ms 1819 ms 3 ReadRate=200.936M/s
```
- For `open_read` and `single_read`, add `ReadRate` as `bytes per second`.
- For `create_write`, add `WriteRate` as `bytes per second`.
- For `exists` and `rename`, add `ExistsCost` and `RenameCost` as `time cost per operation`.
---
be/src/io/fs/benchmark/base_benchmark.h | 2 +-
be/src/io/fs/benchmark/benchmark_factory.hpp | 12 ++-
be/src/io/fs/benchmark/fs_benchmark_tool.cpp | 24 ++---
be/src/io/fs/benchmark/hdfs_benchmark.hpp | 126 +++++++++------------------
bin/run-fs-benchmark.sh | 67 +-------------
build.sh | 4 +
6 files changed, 68 insertions(+), 167 deletions(-)
diff --git a/be/src/io/fs/benchmark/base_benchmark.h b/be/src/io/fs/benchmark/base_benchmark.h
index bcc8ed284c..c28ad02de5 100644
--- a/be/src/io/fs/benchmark/base_benchmark.h
+++ b/be/src/io/fs/benchmark/base_benchmark.h
@@ -97,7 +97,7 @@ protected:
int _threads;
int _iterations;
size_t _file_size;
- int _repetitions = 3;
+ int _repetitions = 1;
std::map<std::string, std::string> _conf_map;
};
diff --git a/be/src/io/fs/benchmark/benchmark_factory.hpp b/be/src/io/fs/benchmark/benchmark_factory.hpp
index bc73f904be..3e8c9314ca 100644
--- a/be/src/io/fs/benchmark/benchmark_factory.hpp
+++ b/be/src/io/fs/benchmark/benchmark_factory.hpp
@@ -50,12 +50,10 @@ Status BenchmarkFactory::getBm(const std::string fs_type, const std::string op_t
*bm = new HdfsCreateWriteBenchmark(threads, iterations, file_size, conf_map);
} else if (op_type == "open_read") {
*bm = new HdfsOpenReadBenchmark(threads, iterations, file_size, conf_map);
- } else if (op_type == "open") {
- *bm = new HdfsOpenBenchmark(threads, iterations, file_size, conf_map);
+ } else if (op_type == "single_read") {
+ *bm = new HdfsSingleReadBenchmark(threads, iterations, file_size, conf_map);
} else if (op_type == "rename") {
*bm = new HdfsRenameBenchmark(threads, iterations, file_size, conf_map);
- } else if (op_type == "delete") {
- *bm = new HdfsDeleteBenchmark(threads, iterations, file_size, conf_map);
} else if (op_type == "exists") {
*bm = new HdfsExistsBenchmark(threads, iterations, file_size, conf_map);
} else {
@@ -80,7 +78,7 @@ public:
_conf_map(conf_map) {}
~MultiBenchmark() {
- for (auto bm : benchmarks) {
+ for (auto bm : _benchmarks) {
delete bm;
}
}
@@ -113,12 +111,12 @@ public:
return st;
}
bm->register_bm();
- benchmarks.emplace_back(bm);
+ _benchmarks.emplace_back(bm);
return Status::OK();
}
private:
- std::vector<BaseBenchmark*> benchmarks;
+ std::vector<BaseBenchmark*> _benchmarks;
std::string _type;
std::string _operation;
int64_t _threads;
diff --git a/be/src/io/fs/benchmark/fs_benchmark_tool.cpp b/be/src/io/fs/benchmark/fs_benchmark_tool.cpp
index 4002ea84ac..a5be5db80a 100644
--- a/be/src/io/fs/benchmark/fs_benchmark_tool.cpp
+++ b/be/src/io/fs/benchmark/fs_benchmark_tool.cpp
@@ -21,12 +21,13 @@
#include "io/fs/benchmark/benchmark_factory.hpp"
-DEFINE_string(fs_type, "hdfs", "Supported File System: s3, hdfs, local");
+DEFINE_string(fs_type, "hdfs", "Supported File System: s3, hdfs");
DEFINE_string(operation, "create_write",
"Supported Operations: create_write, open_read, open, rename, delete, exists");
-DEFINE_string(threads, "10", "Number of threads");
-DEFINE_string(iterations, "10", "Number of runs");
-DEFINE_string(file_size, "104857600", "File size");
+DEFINE_string(threads, "1", "Number of threads");
+DEFINE_string(iterations, "1", "Number of runs of each thread");
+DEFINE_string(repetitions, "1", "Number of iterations");
+DEFINE_string(file_size, "0", "File size for read/write opertions");
DEFINE_string(conf, "", "config file");
std::string get_usage(const std::string& progname) {
@@ -35,8 +36,9 @@ std::string get_usage(const std::string& progname) {
ss << "Usage:\n";
ss << progname
- << " --fs_type=[fs_type] --operation=[op_type] --threads=10 --iterations=10 "
- "--file_size=104857600\n";
+ << " --fs_type=[fs_type] --operation=[op_type] --threads=[num] --iterations=[num] "
+ "--repetitions=[num] "
+ "--file_size=[num]\n";
ss << "\nfs_type:\n";
ss << " hdfs\n";
ss << " s3\n";
@@ -46,13 +48,15 @@ std::string get_usage(const std::string& progname) {
ss << "\nthreads:\n";
ss << " num of threads\n";
ss << "\niterations:\n";
- ss << " num of run\n";
+ ss << " Number of runs of each thread\n";
+ ss << "\nrepetitions:\n";
+ ss << " Number of iterations\n";
ss << "\nfile_size:\n";
- ss << " file size\n";
+ ss << " File size for read/write opertions\n";
ss << "\nExample:\n";
ss << progname
- << " --conf my.conf --fs_type=s3 --operation=read --threads=10 --iterations=100 "
- "--file_size=104857600\n";
+ << " --conf my.conf --fs_type=hdfs --operation=create_write --threads=2 --iterations=100 "
+ "--file_size=1048576\n";
return ss.str();
}
diff --git a/be/src/io/fs/benchmark/hdfs_benchmark.hpp b/be/src/io/fs/benchmark/hdfs_benchmark.hpp
index 637f7a614a..1307ddc95a 100644
--- a/be/src/io/fs/benchmark/hdfs_benchmark.hpp
+++ b/be/src/io/fs/benchmark/hdfs_benchmark.hpp
@@ -38,17 +38,23 @@ public:
Status init() override { return Status::OK(); }
+ virtual std::string get_file_path(benchmark::State& state) {
+ std::string base_dir = _conf_map["base_dir"];
+ auto file_path = fmt::format("{}/test_{}", base_dir, state.thread_index());
+ bm_log("file_path: {}", file_path);
+ return file_path;
+ }
+
Status run(benchmark::State& state) override {
std::shared_ptr<io::FileSystem> fs;
io::FileReaderSPtr reader;
bm_log("begin to init {}", _name);
- std::string base_dir = _conf_map["baseDir"];
size_t buffer_size =
_conf_map.contains("buffer_size") ? std::stol(_conf_map["buffer_size"]) : 1000000L;
io::FileReaderOptions reader_opts = FileFactory::get_reader_options(nullptr);
THdfsParams hdfs_params = parse_properties(_conf_map);
- auto file_path = fmt::format("{}/test_{}", base_dir, state.thread_index());
- bm_log("file_path: {}", file_path);
+
+ auto file_path = get_file_path(state);
RETURN_IF_ERROR(
FileFactory::create_hdfs_reader(hdfs_params, file_path, &fs, &reader, reader_opts));
bm_log("finish to init {}", _name);
@@ -61,10 +67,13 @@ public:
size_t offset = 0;
size_t bytes_read = 0;
- auto start = std::chrono::high_resolution_clock::now();
- size_t read_size = _file_size;
+ size_t read_size = reader->size();
+ if (_file_size > 0) {
+ read_size = std::min(read_size, _file_size);
+ }
long remaining_size = read_size;
+ auto start = std::chrono::high_resolution_clock::now();
while (remaining_size > 0) {
bytes_read = 0;
size_t size = std::min(buffer_size, (size_t)remaining_size);
@@ -81,13 +90,13 @@ public:
remaining_size -= bytes_read;
}
bm_log("finish to run {}", _name);
-
auto end = std::chrono::high_resolution_clock::now();
auto elapsed_seconds =
std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
state.SetIterationTime(elapsed_seconds.count());
+ state.counters["ReadRate"] = benchmark::Counter(read_size, benchmark::Counter::kIsRate);
if (reader != nullptr) {
reader->close();
@@ -96,41 +105,19 @@ public:
}
};
-class HdfsOpenBenchmark : public BaseBenchmark {
+// Read a single specified file
+class HdfsSingleReadBenchmark : public HdfsOpenReadBenchmark {
public:
- HdfsOpenBenchmark(int threads, int iterations, size_t file_size,
- const std::map<std::string, std::string>& conf_map)
- : BaseBenchmark("HdfsOpenBenchmark", threads, iterations, file_size, 3, conf_map) {}
- virtual ~HdfsOpenBenchmark() = default;
-
- Status init() override { return Status::OK(); }
+ HdfsSingleReadBenchmark(int threads, int iterations, size_t file_size,
+ const std::map<std::string, std::string>& conf_map)
+ : HdfsOpenReadBenchmark(threads, iterations, file_size, conf_map) {}
+ virtual ~HdfsSingleReadBenchmark() = default;
- Status run(benchmark::State& state) override {
- bm_log("begin to run {}", _name);
- auto start = std::chrono::high_resolution_clock::now();
- std::string base_dir = _conf_map["baseDir"];
- io::FileReaderOptions reader_opts = FileFactory::get_reader_options(nullptr);
- THdfsParams hdfs_params = parse_properties(_conf_map);
- auto file_path = fmt::format("{}/test_{}", base_dir, state.thread_index());
+ virtual std::string get_file_path(benchmark::State& state) override {
+ std::string file_path = _conf_map["file_path"];
bm_log("file_path: {}", file_path);
- std::shared_ptr<io::HdfsFileSystem> fs;
- io::FileReaderSPtr reader;
- RETURN_IF_ERROR(io::HdfsFileSystem::create(hdfs_params, "", &fs));
- RETURN_IF_ERROR(fs->open_file(file_path, reader_opts, &reader));
- auto end = std::chrono::high_resolution_clock::now();
- auto elapsed_seconds =
- std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
-
- state.SetIterationTime(elapsed_seconds.count());
- bm_log("finish to run {}", _name);
-
- if (reader != nullptr) {
- reader->close();
- }
- return Status::OK();
+ return file_path;
}
-
-private:
};
class HdfsCreateWriteBenchmark : public BaseBenchmark {
@@ -141,24 +128,17 @@ public:
conf_map) {}
virtual ~HdfsCreateWriteBenchmark() = default;
- Status init() override {
- std::string base_dir = _conf_map["baseDir"];
- io::FileReaderOptions reader_opts = FileFactory::get_reader_options(nullptr);
- THdfsParams hdfs_params = parse_properties(_conf_map);
- std::shared_ptr<io::HdfsFileSystem> fs;
- RETURN_IF_ERROR(io::HdfsFileSystem::create(hdfs_params, "", &fs));
- RETURN_IF_ERROR(fs->delete_directory(base_dir));
- return Status::OK();
- }
+ Status init() override { return Status::OK(); }
Status run(benchmark::State& state) override {
bm_log("begin to run {}", _name);
- auto start = std::chrono::high_resolution_clock::now();
- std::string base_dir = _conf_map["baseDir"];
+ std::string base_dir = _conf_map["base_dir"];
io::FileReaderOptions reader_opts = FileFactory::get_reader_options(nullptr);
THdfsParams hdfs_params = parse_properties(_conf_map);
auto file_path = fmt::format("{}/test_{}", base_dir, state.thread_index());
bm_log("file_path: {}", file_path);
+
+ auto start = std::chrono::high_resolution_clock::now();
std::shared_ptr<io::HdfsFileSystem> fs;
io::FileWriterPtr writer;
RETURN_IF_ERROR(io::HdfsFileSystem::create(hdfs_params, "", &fs));
@@ -188,6 +168,8 @@ public:
state.SetIterationTime(elapsed_seconds.count());
bm_log("finish to run {}", _name);
+ state.counters["WriteRate"] = benchmark::Counter(write_size, benchmark::Counter::kIsRate);
+
if (writer != nullptr) {
writer->close();
}
@@ -206,13 +188,14 @@ public:
Status run(benchmark::State& state) override {
bm_log("begin to run {}", _name);
- auto start = std::chrono::high_resolution_clock::now();
- std::string base_dir = _conf_map["baseDir"];
+ std::string base_dir = _conf_map["base_dir"];
io::FileReaderOptions reader_opts = FileFactory::get_reader_options(nullptr);
THdfsParams hdfs_params = parse_properties(_conf_map);
auto file_path = fmt::format("{}/test_{}", base_dir, state.thread_index());
auto new_file_path = fmt::format("{}/test_{}_new", base_dir, state.thread_index());
bm_log("file_path: {}", file_path);
+
+ auto start = std::chrono::high_resolution_clock::now();
std::shared_ptr<io::HdfsFileSystem> fs;
io::FileWriterPtr writer;
RETURN_IF_ERROR(io::HdfsFileSystem::create(hdfs_params, "", &fs));
@@ -224,6 +207,9 @@ public:
state.SetIterationTime(elapsed_seconds.count());
bm_log("finish to run {}", _name);
+ state.counters["RenameCost"] =
+ benchmark::Counter(1, benchmark::Counter::kIsRate | benchmark::Counter::kInvert);
+
if (writer != nullptr) {
writer->close();
}
@@ -233,38 +219,6 @@ public:
private:
};
-class HdfsDeleteBenchmark : public BaseBenchmark {
-public:
- HdfsDeleteBenchmark(int threads, int iterations, size_t file_size,
- const std::map<std::string, std::string>& conf_map)
- : BaseBenchmark("HdfsDeleteBenchmark", threads, 1, file_size, 1, conf_map) {}
- virtual ~HdfsDeleteBenchmark() = default;
-
- Status init() override { return Status::OK(); }
-
- Status run(benchmark::State& state) override {
- bm_log("begin to run {}", _name);
- auto start = std::chrono::high_resolution_clock::now();
- std::string base_dir = _conf_map["baseDir"];
- io::FileReaderOptions reader_opts = FileFactory::get_reader_options(nullptr);
- THdfsParams hdfs_params = parse_properties(_conf_map);
- auto file_path = fmt::format("{}/test_{}", base_dir, state.thread_index());
- bm_log("file_path: {}", file_path);
- std::shared_ptr<io::HdfsFileSystem> fs;
- RETURN_IF_ERROR(io::HdfsFileSystem::create(hdfs_params, "", &fs));
- RETURN_IF_ERROR(fs->delete_file(file_path));
- auto end = std::chrono::high_resolution_clock::now();
- auto elapsed_seconds =
- std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
-
- state.SetIterationTime(elapsed_seconds.count());
- bm_log("finish to run {}", _name);
- return Status::OK();
- }
-
-private:
-};
-
class HdfsExistsBenchmark : public BaseBenchmark {
public:
HdfsExistsBenchmark(int threads, int iterations, size_t file_size,
@@ -276,12 +230,13 @@ public:
Status run(benchmark::State& state) override {
bm_log("begin to run {}", _name);
- auto start = std::chrono::high_resolution_clock::now();
- std::string base_dir = _conf_map["baseDir"];
+ std::string base_dir = _conf_map["base_dir"];
io::FileReaderOptions reader_opts = FileFactory::get_reader_options(nullptr);
THdfsParams hdfs_params = parse_properties(_conf_map);
auto file_path = fmt::format("{}/test_{}", base_dir, state.thread_index());
bm_log("file_path: {}", file_path);
+
+ auto start = std::chrono::high_resolution_clock::now();
std::shared_ptr<io::HdfsFileSystem> fs;
RETURN_IF_ERROR(io::HdfsFileSystem::create(hdfs_params, "", &fs));
bool res = false;
@@ -292,6 +247,9 @@ public:
state.SetIterationTime(elapsed_seconds.count());
bm_log("finish to run {}", _name);
+
+ state.counters["ExistsCost"] =
+ benchmark::Counter(1, benchmark::Counter::kIsRate | benchmark::Counter::kInvert);
return Status::OK();
}
};
diff --git a/bin/run-fs-benchmark.sh b/bin/run-fs-benchmark.sh
index bf433ef178..9eb47d3ee8 100755
--- a/bin/run-fs-benchmark.sh
+++ b/bin/run-fs-benchmark.sh
@@ -26,58 +26,6 @@ if [[ "$(uname -s)" == 'Darwin' ]] && command -v brew &>/dev/null; then
export PATH
fi
-OPTS="$(getopt \
- -n "$0" \
- -o '' \
- -l 'conf:,fs_type:,operation:,threads:,iterations:,file_size:' \
- -- "$@")"
-
-eval set -- "${OPTS}"
-
-while true; do
- case "$1" in
- --conf)
- CONF="$2"
- shift 2
- ;;
- --fs_type)
- FS_TYPE="$2"
- shift 2
- ;;
- --operation)
- OPERATION="$2"
- shift 2
- ;;
- --threads)
- THREADS="$2"
- shift 2
- ;;
- --iterations)
- ITERATIONS="$2"
- shift 2
- ;;
- --file_size)
- FILE_SIZE="$2"
- shift 2
- ;;
- --)
- shift
- break
- ;;
- *)
- echo "Internal error"
- exit 1
- ;;
- esac
-done
-
-echo "CONF: ${CONF}"
-echo "FS_TYPE: ${FS_TYPE}"
-echo "OPERATION: ${OPERATION}"
-echo "THREADS: ${THREADS}"
-echo "ITERATIONS: ${ITERATIONS}"
-echo "FILE_SIZE: ${FILE_SIZE}"
-
DORIS_HOME="$(
cd "${curdir}/.."
pwd
@@ -316,16 +264,5 @@ export LIBHDFS_OPTS="${final_java_opt}"
# see https://github.com/apache/doris/blob/master/docs/zh-CN/community/developer-guide/debug-tool.md#jemalloc-heap-profile
export JEMALLOC_CONF="percpu_arena:percpu,background_thread:true,metadata_thp:auto,muzzy_decay_ms:30000,dirty_decay_ms:30000,oversize_threshold:0,lg_tcache_max:16,prof_prefix:jeprof.out"
-${LIMIT:+${LIMIT}} "${DORIS_HOME}/lib/fs_benchmark_tool" --conf "${CONF}" --fs_type="${FS_TYPE}" --operation="${OPERATION}" --threads="${THREADS}" --iterations="${ITERATIONS}" --file_size="${FILE_SIZE}" 2>&1 | tee "${LOG_DIR}/fs_benchmark_tool.log"
-
-qps="0MB/s"
-latency="0ms"
-
-eval "$(grep "median" "${LOG_DIR}/fs_benchmark_tool.log" | awk '{printf("qps=%sMB/s latency=%sms", "'"${FILE_SIZE}"'" / 1024 / 1024 / ($2 * "'"${THREADS}"'" / 1000), $2 * "'"${THREADS}"'")}')"
-
-echo "------------------------------"
-echo " Benchmark Result "
-echo "------------------------------"
-echo "thread_num: ${THREADS}."
-echo "qps: ${qps}."
-echo "latency: ${latency}."
+echo "$@"
+${LIMIT:+${LIMIT}} "${DORIS_HOME}/lib/fs_benchmark_tool" "$@" 2>&1 | tee "${LOG_DIR}/fs_benchmark_tool.log"
diff --git a/build.sh b/build.sh
index 44f69fc88f..dfc16b7936 100755
--- a/build.sh
+++ b/build.sh
@@ -637,6 +637,10 @@ EOF
cp -r -p "${DORIS_HOME}/be/output/lib/debug_info" "${DORIS_OUTPUT}/be/lib"/
fi
+ if [[ "${BUILD_FS_BENCHMARK}" = "ON" ]]; then
+ cp -r -p "${DORIS_HOME}/bin/run-fs-benchmark.sh" "${DORIS_OUTPUT}/be/bin/"/
+ fi
+
extensions_modules=("")
extensions_modules+=("java-udf")
extensions_modules+=("jdbc-scanner")
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org