You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@doris.apache.org by GitBox <gi...@apache.org> on 2022/10/08 10:08:58 UTC

[GitHub] [doris] pengxiangyu commented on a diff in pull request #12897: [feature](remote) support local cache GC by disk usage

pengxiangyu commented on code in PR #12897:
URL: https://github.com/apache/doris/pull/12897#discussion_r990580003


##########
be/src/io/cache/file_cache_manager.cpp:
##########
@@ -18,15 +18,39 @@
 #include "io/cache/file_cache_manager.h"
 
 #include "gutil/strings/util.h"
+#include "io/cache/dummy_file_cache.h"
 #include "io/cache/sub_file_cache.h"
 #include "io/cache/whole_file_cache.h"
 #include "io/fs/local_file_system.h"
+#include "olap/storage_engine.h"
 #include "util/file_utils.h"
 #include "util/string_util.h"
 
 namespace doris {
 namespace io {
 
+void GCContextPerDisk::init(const std::string& path, int64_t max_size) {
+    _disk_path = path;
+    _conf_max_size = max_size;
+    _used_size = 0;
+}

Review Comment:
   An empty line is needed after a function.



##########
be/src/io/cache/dummy_file_cache.cpp:
##########
@@ -0,0 +1,143 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "io/cache/dummy_file_cache.h"
+
+#include "gutil/strings/util.h"
+#include "io/fs/local_file_system.h"
+#include "util/file_utils.h"
+#include "util/string_util.h"
+
+namespace doris {
+namespace io {
+
+const static std::string WHOLE_FILE_CACHE_NAME = "WHOLE_FILE_CACHE";

Review Comment:
   WHOLE_FILE_CACHE_NAME is not used.



##########
be/src/io/cache/file_cache_manager.cpp:
##########
@@ -56,88 +80,82 @@ void FileCacheManager::remove_file_cache(const std::string& cache_path) {
     }
 }
 
-void FileCacheManager::clean_timeout_caches() {
-    std::shared_lock<std::shared_mutex> rdlock(_cache_map_lock);
-    for (std::map<std::string, FileCachePtr>::const_iterator iter = _file_cache_map.cbegin();
-         iter != _file_cache_map.cend(); ++iter) {
-        if (iter->second == nullptr) {
-            continue;
+void FileCacheManager::_add_file_cache_for_gc_by_disk(std::vector<GCContextPerDisk>& contexts,
+                                                      FileCachePtr file_cache) {
+    // sort file cache by last match time
+    if (config::file_cache_max_size_per_disk > 0) {
+        auto file_size = file_cache->cache_file_size();
+        if (file_size <= 0) {
+            return;
+        }
+        for (size_t i = 0; i < contexts.size(); ++i) {
+            if (contexts[i].try_add_file_cache(file_cache, file_size)) {
+                break;
+            }
         }
-        iter->second->clean_timeout_cache();
     }
 }
+void FileCacheManager::gc_file_caches() {
+    int64_t gc_conf_size = config::file_cache_max_size_per_disk;
+    std::vector<GCContextPerDisk> contexts;
+    // init for GC by disk size
+    if (gc_conf_size > 0) {
+        std::vector<DataDir*> data_dirs = doris::StorageEngine::instance()->get_stores();
+        contexts.resize(data_dirs.size());
+        for (size_t i = 0; i < contexts.size(); ++i) {
+            contexts[i].init(data_dirs[i]->path(), gc_conf_size);
+        }
+    }
 
-void FileCacheManager::clean_timeout_file_not_in_mem(const std::string& cache_path) {
-    time_t now = time(nullptr);
+    // process unused file caches
     std::shared_lock<std::shared_mutex> rdlock(_cache_map_lock);
-    // Deal with caches not in _file_cache_map
-    if (_file_cache_map.find(cache_path) == _file_cache_map.end()) {
-        std::vector<Path> cache_file_names;
-        if (io::global_local_filesystem()->list(cache_path, &cache_file_names).ok()) {
-            std::map<std::string, bool> cache_names;
-            std::list<std::string> done_names;
-            for (Path cache_file_name : cache_file_names) {
-                std::string filename = cache_file_name.native();
-                if (!ends_with(filename, CACHE_DONE_FILE_SUFFIX)) {
-                    cache_names[filename] = true;
-                    continue;
-                }
-                done_names.push_back(filename);
-                std::stringstream done_file_ss;
-                done_file_ss << cache_path << "/" << filename;
-                std::string done_file_path = done_file_ss.str();
-                time_t m_time;
-                if (!FileUtils::mtime(done_file_path, &m_time).ok()) {
-                    continue;
-                }
-                if (now - m_time < config::file_cache_alive_time_sec) {
-                    continue;
-                }
-                std::string cache_file_path =
-                        StringReplace(done_file_path, CACHE_DONE_FILE_SUFFIX, "", true);
-                LOG(INFO) << "Delete timeout done_cache_path: " << done_file_path
-                          << ", cache_file_path: " << cache_file_path << ", m_time: " << m_time;
-                if (!io::global_local_filesystem()->delete_file(done_file_path).ok()) {
-                    LOG(ERROR) << "delete_file failed: " << done_file_path;
+    std::vector<TabletSharedPtr> tablets =
+            StorageEngine::instance()->tablet_manager()->get_all_tablet();
+    for (const auto& tablet : tablets) {
+        std::vector<Path> seg_file_paths;
+        if (io::global_local_filesystem()->list(tablet->tablet_path(), &seg_file_paths).ok()) {
+            for (Path seg_file : seg_file_paths) {
+                std::string seg_filename = seg_file.native();
+                // check if it is a dir name
+                if (ends_with(seg_filename, ".dat")) {
                     continue;
                 }
-                if (!io::global_local_filesystem()->delete_file(cache_file_path).ok()) {
-                    LOG(ERROR) << "delete_file failed: " << cache_file_path;
+                // skip file cache already in memory
+                std::stringstream ss;
+                ss << tablet->tablet_path() << "/" << seg_filename;
+                std::string cache_path = ss.str();
+                if (_file_cache_map.find(cache_path) != _file_cache_map.end()) {
                     continue;
                 }
+
+                auto file_cache = std::make_shared<DummyFileCache>(
+                        cache_path, config::file_cache_alive_time_sec);
+                // load cache meta from disk and clean unfinished cache files
+                file_cache->load_and_clean();
+                // policy1: GC file cache by timeout
+                file_cache->clean_timeout_cache();
+                // sort file cache by last match time
+                _add_file_cache_for_gc_by_disk(contexts, file_cache);
             }
-            // find cache file without done file.
-            for (std::list<std::string>::iterator itr = done_names.begin(); itr != done_names.end();
-                 ++itr) {
-                std::string cache_filename = StringReplace(*itr, CACHE_DONE_FILE_SUFFIX, "", true);
-                if (cache_names.find(cache_filename) != cache_names.end()) {
-                    cache_names.erase(cache_filename);
-                }
-            }
-            // remove cache file without done file
-            for (std::map<std::string, bool>::iterator itr = cache_names.begin();
-                 itr != cache_names.end(); ++itr) {
-                std::stringstream cache_file_ss;
-                cache_file_ss << cache_path << "/" << itr->first;
-                std::string cache_file_path = cache_file_ss.str();
-                time_t m_time;
-                if (!FileUtils::mtime(cache_file_path, &m_time).ok()) {
-                    continue;
-                }
-                if (now - m_time < config::file_cache_alive_time_sec) {
-                    continue;
-                }
-                LOG(INFO) << "Delete cache file without done file: " << cache_file_path;
-                if (!io::global_local_filesystem()->delete_file(cache_file_path).ok()) {
-                    LOG(ERROR) << "delete_file failed: " << cache_file_path;
-                }
-            }
-            if (io::global_local_filesystem()->list(cache_path, &cache_file_names).ok() &&
-                cache_file_names.size() == 0) {
-                if (global_local_filesystem()->delete_directory(cache_path).ok()) {
-                    LOG(INFO) << "Delete empty dir: " << cache_path;
-                }
-            }
+        }
+    }
+
+    // process file caches in memory

Review Comment:
   You need to add a read lock for _file_cache_map by calling std::shared_lock<std::shared_mutex> rdlock(_cache_map_lock);



##########
be/src/io/cache/file_cache_manager.cpp:
##########
@@ -56,88 +80,82 @@ void FileCacheManager::remove_file_cache(const std::string& cache_path) {
     }
 }
 
-void FileCacheManager::clean_timeout_caches() {
-    std::shared_lock<std::shared_mutex> rdlock(_cache_map_lock);
-    for (std::map<std::string, FileCachePtr>::const_iterator iter = _file_cache_map.cbegin();
-         iter != _file_cache_map.cend(); ++iter) {
-        if (iter->second == nullptr) {
-            continue;
+void FileCacheManager::_add_file_cache_for_gc_by_disk(std::vector<GCContextPerDisk>& contexts,
+                                                      FileCachePtr file_cache) {
+    // sort file cache by last match time
+    if (config::file_cache_max_size_per_disk > 0) {
+        auto file_size = file_cache->cache_file_size();
+        if (file_size <= 0) {
+            return;
+        }
+        for (size_t i = 0; i < contexts.size(); ++i) {
+            if (contexts[i].try_add_file_cache(file_cache, file_size)) {
+                break;
+            }
         }
-        iter->second->clean_timeout_cache();
     }
 }
+void FileCacheManager::gc_file_caches() {
+    int64_t gc_conf_size = config::file_cache_max_size_per_disk;
+    std::vector<GCContextPerDisk> contexts;
+    // init for GC by disk size
+    if (gc_conf_size > 0) {
+        std::vector<DataDir*> data_dirs = doris::StorageEngine::instance()->get_stores();
+        contexts.resize(data_dirs.size());
+        for (size_t i = 0; i < contexts.size(); ++i) {
+            contexts[i].init(data_dirs[i]->path(), gc_conf_size);
+        }
+    }
 
-void FileCacheManager::clean_timeout_file_not_in_mem(const std::string& cache_path) {
-    time_t now = time(nullptr);
+    // process unused file caches
     std::shared_lock<std::shared_mutex> rdlock(_cache_map_lock);
-    // Deal with caches not in _file_cache_map
-    if (_file_cache_map.find(cache_path) == _file_cache_map.end()) {
-        std::vector<Path> cache_file_names;
-        if (io::global_local_filesystem()->list(cache_path, &cache_file_names).ok()) {
-            std::map<std::string, bool> cache_names;
-            std::list<std::string> done_names;
-            for (Path cache_file_name : cache_file_names) {
-                std::string filename = cache_file_name.native();
-                if (!ends_with(filename, CACHE_DONE_FILE_SUFFIX)) {
-                    cache_names[filename] = true;
-                    continue;
-                }
-                done_names.push_back(filename);
-                std::stringstream done_file_ss;
-                done_file_ss << cache_path << "/" << filename;
-                std::string done_file_path = done_file_ss.str();
-                time_t m_time;
-                if (!FileUtils::mtime(done_file_path, &m_time).ok()) {
-                    continue;
-                }
-                if (now - m_time < config::file_cache_alive_time_sec) {
-                    continue;
-                }
-                std::string cache_file_path =
-                        StringReplace(done_file_path, CACHE_DONE_FILE_SUFFIX, "", true);
-                LOG(INFO) << "Delete timeout done_cache_path: " << done_file_path
-                          << ", cache_file_path: " << cache_file_path << ", m_time: " << m_time;
-                if (!io::global_local_filesystem()->delete_file(done_file_path).ok()) {
-                    LOG(ERROR) << "delete_file failed: " << done_file_path;
+    std::vector<TabletSharedPtr> tablets =
+            StorageEngine::instance()->tablet_manager()->get_all_tablet();
+    for (const auto& tablet : tablets) {
+        std::vector<Path> seg_file_paths;
+        if (io::global_local_filesystem()->list(tablet->tablet_path(), &seg_file_paths).ok()) {
+            for (Path seg_file : seg_file_paths) {
+                std::string seg_filename = seg_file.native();
+                // check if it is a dir name
+                if (ends_with(seg_filename, ".dat")) {
                     continue;
                 }
-                if (!io::global_local_filesystem()->delete_file(cache_file_path).ok()) {
-                    LOG(ERROR) << "delete_file failed: " << cache_file_path;
+                // skip file cache already in memory
+                std::stringstream ss;
+                ss << tablet->tablet_path() << "/" << seg_filename;
+                std::string cache_path = ss.str();
+                if (_file_cache_map.find(cache_path) != _file_cache_map.end()) {
                     continue;
                 }
+

Review Comment:
   A reused DummyFileCache is better; creating a new DummyFileCache costs more time, and this operation is too heavy.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org