You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2019/01/23 16:36:50 UTC

[impala] 03/04: IMPALA-7941: part 1: detect cgroups memory limit

This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit d1f2c1a2269604efb9a498c0a0137bb4ea4eff07
Author: Tim Armstrong <ta...@cloudera.com>
AuthorDate: Fri Dec 21 07:30:43 2018 -0800

    IMPALA-7941: part 1: detect cgroups memory limit
    
    This adds the logic to detect the cgroups memory limit,
    but does not use it to set the process memory limit yet.
    The information obtained is logged at startup and accessible
    through the Web UI.
    
    The patch boils down to reading from the appropriate
    places in the filesystem. The main complication is that paths need to be
    translated to point to the right cgroup node inside the container.
    
    This deletes some useless cgroup logic from ProcessStateInfo that
    printed whatever happened to be the last cgroup in a file.
    
    Testing:
    Added a unit test to check that the code could parse the cgroups
    hierarchy ok and return a positive value.
    
    Tested on CentOS6 and CentOS7.
    
    Change-Id: Ic0f5ed0e94511361bf5605822c0874c16c45ffa9
    Reviewed-on: http://gerrit.cloudera.org:8080/12120
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/common/init.cc                |   2 +
 be/src/util/CMakeLists.txt           |   1 +
 be/src/util/cgroup-util.cc           | 177 +++++++++++++++++++++++++++++++++++
 be/src/util/cgroup-util.h            |  62 ++++++++++++
 be/src/util/default-path-handlers.cc |  27 +++---
 be/src/util/proc-info-test.cc        |  21 +++++
 be/src/util/process-state-info.cc    |  34 ++-----
 be/src/util/process-state-info.h     |  13 +--
 www/root.tmpl                        |   4 +
 9 files changed, 289 insertions(+), 52 deletions(-)

diff --git a/be/src/common/init.cc b/be/src/common/init.cc
index fcd6971..a9d00ce 100644
--- a/be/src/common/init.cc
+++ b/be/src/common/init.cc
@@ -38,6 +38,7 @@
 #include "runtime/hdfs-fs-cache.h"
 #include "runtime/lib-cache.h"
 #include "runtime/mem-tracker.h"
+#include "util/cgroup-util.h"
 #include "util/cpu-info.h"
 #include "util/debug-util.h"
 #include "util/decimal-util.h"
@@ -282,6 +283,7 @@ void impala::InitCommonRuntime(int argc, char** argv, bool init_jvm,
   LOG(INFO) << DiskInfo::DebugString();
   LOG(INFO) << MemInfo::DebugString();
   LOG(INFO) << OsInfo::DebugString();
+  LOG(INFO) << CGroupUtil::DebugString();
   LOG(INFO) << "Process ID: " << getpid();
   LOG(INFO) << "Default AES cipher mode for spill-to-disk: "
             << EncryptionKey::ModeToString(EncryptionKey::GetSupportedDefaultMode());
diff --git a/be/src/util/CMakeLists.txt b/be/src/util/CMakeLists.txt
index 62148f4..1405dd8 100644
--- a/be/src/util/CMakeLists.txt
+++ b/be/src/util/CMakeLists.txt
@@ -35,6 +35,7 @@ add_library(Util
   bit-util.cc
   bloom-filter.cc
   bloom-filter-ir.cc
+  cgroup-util.cc
   coding-util.cc
   codec.cc
   common-metrics.cc
diff --git a/be/src/util/cgroup-util.cc b/be/src/util/cgroup-util.cc
new file mode 100644
index 0000000..412ec55
--- /dev/null
+++ b/be/src/util/cgroup-util.cc
@@ -0,0 +1,177 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "util/cgroup-util.h"
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <utility>
+
+#include <boost/algorithm/string.hpp>
+
+#include "gutil/strings/escaping.h"
+#include "gutil/strings/substitute.h"
+#include "util/error-util.h"
+#include "util/string-parser.h"
+
+#include "common/names.h"
+
+using boost::algorithm::is_any_of;
+using boost::algorithm::split;
+using boost::algorithm::token_compress_on;
+using strings::CUnescape;
+using std::pair;
+
+namespace impala {
+
+Status CGroupUtil::FindGlobalCGroup(const string& subsystem, string* path) {
+  ifstream proc_cgroups("/proc/self/cgroup", ios::in);
+  string line;
+  // The line format looks like this:
+  // 4:memory:/user.slice
+  // 9:cpu,cpuacct:/user.slice
+  // Loop on getline() itself so that hitting EOF ends the scan instead of being
+  // reported as a read error, and so a final unterminated line is still examined.
+  while (getline(proc_cgroups, line)) {
+    vector<string> fields;
+    split(fields, line, is_any_of(":"));
+    // ":" in the path does not appear to be escaped, so only lines with exactly 3
+    // tokens can be parsed. Check at runtime (not just with DCHECK) so that release
+    // builds never index past the end of 'fields'.
+    if (fields.size() != 3) {
+      return Status(Substitute(
+          "Could not parse line from /proc/self/cgroup - had $0 != 3 tokens: '$1'",
+          fields.size(), line));
+    }
+    vector<string> subsystems;
+    split(subsystems, fields[1], is_any_of(","));
+    auto it = std::find(subsystems.begin(), subsystems.end(), subsystem);
+    if (it == subsystems.end()) continue;
+    *path = move(fields[2]);
+    return Status::OK();
+  }
+  // The loop ends on EOF or on failure; only a failure without EOF (including the
+  // file not opening at all) is an I/O error.
+  if (!proc_cgroups.eof()) {
+    return Status(Substitute("Error reading /proc/self/cgroup: $0", GetStrErrMsg()));
+  }
+  return Status(
+      Substitute("Could not find subsystem $0 in /proc/self/cgroup", subsystem));
+}
+
+static Status UnescapePath(const string& escaped, string* unescaped) {
+  // A false return from CUnescape() becomes an error Status that includes 'error'.
+  string error;
+  const bool unescaped_ok = CUnescape(escaped, unescaped, &error);
+  if (unescaped_ok) return Status::OK();
+  return Status(Substitute("Could not unescape path '$0': $1", escaped, error));
+}
+
+Status CGroupUtil::FindCGroupMounts(
+    const string& subsystem, pair<string, string>* result) {
+  ifstream mountinfo("/proc/self/mountinfo", ios::in);
+  string line;
+  // The relevant lines look like below (see proc manpage for full documentation). The
+  // first example is running outside of a container, the second example is running
+  // inside a docker container. Field 3 is the path relative to the root CGroup on
+  // the host and Field 4 is the mount point from this process's point of view.
+  // 34 29 0:28 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:15 -
+  //    cgroup cgroup rw,memory
+  // 275 271 0:28 /docker/f23eee6f88c2ba99fcce /sys/fs/cgroup/memory
+  //    ro,nosuid,nodev,noexec,relatime master:15 - cgroup cgroup rw,memory
+  // Loop on getline() so EOF ends the scan; stopping without EOF means an I/O error.
+  while (getline(mountinfo, line)) {
+    vector<string> fields;
+    split(fields, line, is_any_of(" "), token_compress_on);
+    // Runtime check (not just DCHECK) so release builds never index out of range.
+    if (fields.size() < 7) {
+      return Status(Substitute(
+          "Could not parse line from /proc/self/mountinfo - had $0 < 7 tokens: '$1'",
+          fields.size(), line));
+    }
+    if (fields[fields.size() - 3] != "cgroup") continue;
+    // This is a cgroup mount. Check if it's the mount we're looking for.
+    vector<string> cgroup_opts;
+    split(cgroup_opts, fields[fields.size() - 1], is_any_of(","), token_compress_on);
+    auto it = std::find(cgroup_opts.begin(), cgroup_opts.end(), subsystem);
+    if (it == cgroup_opts.end()) continue;
+    string mount_path, system_path;
+    RETURN_IF_ERROR(UnescapePath(fields[4], &mount_path));
+    RETURN_IF_ERROR(UnescapePath(fields[3], &system_path));
+    // Strip trailing "/" so both paths agree on it; empty() keeps back() well-defined.
+    if (!system_path.empty() && system_path.back() == '/') system_path.pop_back();
+    *result = {mount_path, system_path};
+    return Status::OK();
+  }
+  if (!mountinfo.eof()) {
+    return Status(Substitute("Error reading /proc/self/mountinfo: $0", GetStrErrMsg()));
+  }
+  return Status(
+      Substitute("Could not find subsystem $0 in /proc/self/mountinfo", subsystem));
+}
+
+Status CGroupUtil::FindAbsCGroupPath(const string& subsystem, string* path) {
+  pair<string, string> mounts; // (mount point, path relative to system-wide root)
+  RETURN_IF_ERROR(FindGlobalCGroup(subsystem, path));
+  RETURN_IF_ERROR(FindCGroupMounts(subsystem, &mounts));
+  const auto prefix_len = mounts.second.size();
+  if (path->compare(0, prefix_len, mounts.second) != 0) {
+    return Status(Substitute(
+        "Expected CGroup path '$0' to start with '$1'", *path, mounts.second));
+  }
+  // Swap the system-side prefix for the mount point visible to this process.
+  path->replace(0, prefix_len, mounts.first);
+  return Status::OK();
+}
+
+Status CGroupUtil::FindCGroupMemLimit(int64_t* bytes) {
+  string dir;
+  RETURN_IF_ERROR(FindAbsCGroupPath("memory", &dir));
+  const string limit_file_path = dir + "/memory.limit_in_bytes";
+  ifstream limit_file(limit_file_path, ios::in);
+  string contents;
+  // Only the first line of the limit file is read.
+  getline(limit_file, contents);
+  if (limit_file.fail() || limit_file.bad()) {
+    return Status(Substitute("Error reading $0: $1", limit_file_path, GetStrErrMsg()));
+  }
+  // Parse into an int64_t, since that is what we use for memory amounts elsewhere in
+  // the codebase. If it overflows, returning the max value of int64_t is ok because that
+  // is effectively unlimited, so PARSE_OVERFLOW is accepted alongside PARSE_SUCCESS.
+  StringParser::ParseResult pr;
+  *bytes = StringParser::StringToInt<int64_t>(contents.c_str(), contents.size(), &pr);
+  const bool parsed =
+      pr == StringParser::PARSE_SUCCESS || pr == StringParser::PARSE_OVERFLOW;
+  if (parsed && *bytes >= 0) return Status::OK();
+  return Status(Substitute(
+      "Failed to parse $0 as positive int64: '$1'", limit_file_path, contents));
+}
+
+std::string CGroupUtil::DebugString() {
+  // 'limit_bytes' is only read below when FindCGroupMemLimit() succeeded.
+  int64_t limit_bytes;
+  Status status = FindCGroupMemLimit(&limit_bytes);
+  // Show the limit if it could be determined; otherwise put the error detail in its
+  // place so that the reason for the failure appears in the same spot of the
+  // returned string.
+  const string limit_or_error =
+      status.ok() ? Substitute("$0", limit_bytes) : status.GetDetail();
+  return Substitute("Process CGroup Info: memory.limit_in_bytes=$0", limit_or_error);
+}
+
+} // namespace impala
diff --git a/be/src/util/cgroup-util.h b/be/src/util/cgroup-util.h
new file mode 100644
index 0000000..7a4ccbe
--- /dev/null
+++ b/be/src/util/cgroup-util.h
@@ -0,0 +1,62 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include <utility>
+
+#include "common/status.h"
+
+namespace impala {
+
+class CGroupUtil {
+ public:
+  /// Determines the CGroup memory limit from the current process's cgroup.
+  /// If the limit is more than INT64_MAX, INT64_MAX is returned (since that is
+  /// effectively unlimited anyway). Does not take into account memory limits
+  /// set on any ancestor CGroups.
+  static Status FindCGroupMemLimit(int64_t* bytes);
+
+  /// Returns a human-readable string with information about CGroups.
+  static std::string DebugString();
+
+ private:
+  friend class CGroupInfo_ErrorHandling_Test; // Lets the test call private helpers.
+  /// Finds the path of the cgroup of 'subsystem' for the current process.
+  /// E.g. FindGlobalCGroup("memory") will return the memory cgroup
+  /// that this process belongs to. This is a path relative to the system-wide root
+  /// cgroup for 'subsystem'.
+  static Status FindGlobalCGroup(const std::string& subsystem, std::string* path);
+
+  /// Returns the absolute path to the CGroup from inside the container.
+  /// E.g. if this process belongs to
+  /// /sys/fs/cgroup/memory/kubepods/burstable/pod-<long unique id>, which is mounted at
+  /// /sys/fs/cgroup/memory inside the container, this function returns
+  /// "/sys/fs/cgroup/memory".
+  static Status FindAbsCGroupPath(const std::string& subsystem, std::string* path);
+
+  /// Figures out the mapping of the cgroup root from the container's point of view to
+  /// the full path relative to the system-wide cgroups outside of the container.
+  /// E.g. /sys/fs/cgroup/memory/kubepods/burstable/pod-<long unique id> may be mounted at
+  /// /sys/fs/cgroup/memory inside the container. In that case this function would return
+  /// ("/sys/fs/cgroup/memory", "/kubepods/burstable/pod-<long unique id>").
+  static Status FindCGroupMounts(
+      const std::string& subsystem, std::pair<std::string, std::string>* result);
+};
+} // namespace impala
diff --git a/be/src/util/default-path-handlers.cc b/be/src/util/default-path-handlers.cc
index d2b57f3..f0fbe60 100644
--- a/be/src/util/default-path-handlers.cc
+++ b/be/src/util/default-path-handlers.cc
@@ -27,18 +27,18 @@
 
 #include "common/logging.h"
 #include "rpc/jni-thrift-util.h"
-#include "runtime/mem-tracker.h"
 #include "runtime/exec-env.h"
+#include "runtime/mem-tracker.h"
 #include "service/impala-server.h"
+#include "util/cgroup-util.h"
 #include "util/common-metrics.h"
+#include "util/cpu-info.h"
 #include "util/debug-util.h"
+#include "util/disk-info.h"
+#include "util/jni-util.h"
 #include "util/mem-info.h"
 #include "util/pprof-path-handlers.h"
-#include "util/mem-info.h"
-#include "util/cpu-info.h"
-#include "util/disk-info.h"
 #include "util/process-state-info.h"
-#include "util/jni-util.h"
 
 #include "common/names.h"
 
@@ -266,16 +266,17 @@ void RootHandler(const Webserver::ArgumentMap& args, Document* document) {
   document->AddMember("disk_info", disk_info, document->GetAllocator());
   Value os_info(OsInfo::DebugString().c_str(), document->GetAllocator());
   document->AddMember("os_info", os_info, document->GetAllocator());
-  Value process_state_info(ProcessStateInfo().DebugString().c_str(),
-    document->GetAllocator());
-  document->AddMember("process_state_info", process_state_info,
-    document->GetAllocator());
+  Value process_state_info(
+      ProcessStateInfo().DebugString().c_str(), document->GetAllocator());
+  document->AddMember("process_state_info", process_state_info, document->GetAllocator());
+  Value cgroup_info(CGroupUtil::DebugString().c_str(), document->GetAllocator());
+  document->AddMember("cgroup_info", cgroup_info, document->GetAllocator());
 
   if (CommonMetrics::PROCESS_START_TIME != nullptr) {
-    Value process_start_time(CommonMetrics::PROCESS_START_TIME->GetValue().c_str(),
-      document->GetAllocator());
-    document->AddMember("process_start_time", process_start_time,
-      document->GetAllocator());
+    Value process_start_time(
+        CommonMetrics::PROCESS_START_TIME->GetValue().c_str(), document->GetAllocator());
+    document->AddMember(
+        "process_start_time", process_start_time, document->GetAllocator());
   }
 
   ExecEnv* env = ExecEnv::GetInstance();
diff --git a/be/src/util/proc-info-test.cc b/be/src/util/proc-info-test.cc
index 9e25663..3c7c30c 100644
--- a/be/src/util/proc-info-test.cc
+++ b/be/src/util/proc-info-test.cc
@@ -24,6 +24,8 @@
 
 #include "common/init.h"
 #include "service/fe-support.h"
+#include "testutil/gtest-util.h"
+#include "util/cgroup-util.h"
 #include "util/mem-info.h"
 #include "util/process-state-info.h"
 #include "util/test-info.h"
@@ -39,6 +41,25 @@ TEST(MemInfo, Basic) {
   ASSERT_GT(MemInfo::commit_limit(), 0);
 }
 
+TEST(CGroupInfo, Basic) {
+  int64_t limit_bytes;
+  ASSERT_OK(CGroupUtil::FindCGroupMemLimit(&limit_bytes));
+  EXPECT_GT(limit_bytes, 0); // A detected limit is expected to be positive.
+}
+
+// Every lookup of a cgroup subsystem that does not exist must return an error.
+TEST(CGroupInfo, ErrorHandling) {
+  string cgroup_path;
+  pair<string, string> mounts;
+  Status status = CGroupUtil::FindGlobalCGroup("fake-cgroup", &cgroup_path);
+  LOG(INFO) << status.msg().msg();
+  EXPECT_FALSE(status.ok());
+  status = CGroupUtil::FindAbsCGroupPath("fake-cgroup", &cgroup_path);
+  EXPECT_FALSE(status.ok());
+  status = CGroupUtil::FindCGroupMounts("fake-cgroup", &mounts);
+  EXPECT_FALSE(status.ok());
+}
+
 TEST(ProcessStateInfo, Basic) {
   ProcessStateInfo process_state_info;
   ASSERT_GE(process_state_info.GetBytes("io/read_bytes"), 0);
diff --git a/be/src/util/process-state-info.cc b/be/src/util/process-state-info.cc
index 1ef8dd1..e72138a 100644
--- a/be/src/util/process-state-info.cc
+++ b/be/src/util/process-state-info.cc
@@ -99,23 +99,6 @@ void ProcessStateInfo::ReadProcIO() {
   if (ioinfo.is_open()) ioinfo.close();
 }
 
-void ProcessStateInfo::ReadProcCgroup() {
-  ifstream cgroupinfo("/proc/self/cgroup", ios::in);
-  string line;
-  while (cgroupinfo.good() && !cgroupinfo.eof()) {
-    getline(cgroupinfo, line);
-    vector<string> fields;
-    split(fields, line, is_any_of(":"), token_compress_on);
-    if (fields.size() < 3) continue;
-    process_state_map_["cgroup/hierarchy_id"] = fields[0];
-    process_state_map_["cgroup/subsystems"] = fields[1];
-    process_state_map_["cgroup/control_group"] = fields[2];
-    break;
-  }
-
-  if (cgroupinfo.is_open()) cgroupinfo.close();
-}
-
 void ProcessStateInfo::ReadProcSched() {
   ifstream schedinfo("/proc/self/sched", ios::in);
   string line;
@@ -170,7 +153,6 @@ ProcessStateInfo::ProcessStateInfo(bool get_extended_metrics)
   ReadProcStatus();
   if (get_extended_metrics) {
     ReadProcIO();
-    ReadProcCgroup();
     ReadProcSched();
     ReadProcFileDescriptorCount();
   }
@@ -216,25 +198,21 @@ string ProcessStateInfo::DebugString() const {
            << PrettyPrinter::Print(GetBytes("io/write_bytes"), TUnit::BYTES) << endl
            << "    Read I/O: " << GetInt64("io/syscr") << endl
            << "    Write I/O: " << GetInt64("io/syscw") << endl
-           << "  CGroups: " << endl
-           << "    Hierarchy: " << GetString("cgroup/hierarchy_id") << endl
-           << "    Subsystems: " << GetString("cgroup/subsystems") <<endl
-           << "    Control Group: " << GetString("cgroup/control_group") << endl
            << "  Schedule: " << endl
            << "    Sum Execute Time: " << GetString("sched/se.sum_exec_runtime") << endl
            << "    Max Wait Time: " << GetString("sched/se.statistics.wait_max") << endl
            << "    Sum Wait Time: " << GetString("sched/se.statistics.wait_sum") << endl
            << "    Wait Count: " << GetInt64("sched/se.statistics.wait_count") << endl
-           << "    Sum I/O Wait Time: "
-           << GetString("sched/se.statistics.iowait_sum") << endl
-           << "    I/O Wait Count: "
-           << GetInt64("sched/se.statistics.iowait_count") << endl
+           << "    Sum I/O Wait Time: " << GetString("sched/se.statistics.iowait_sum")
+           << endl
+           << "    I/O Wait Count: " << GetInt64("sched/se.statistics.iowait_count")
+           << endl
            << "    Wakeup Count with cpu migration: "
            << GetInt64("sched/se.statistics.nr_wakeups_migrate") << endl
            << "    Switches: " << GetInt64("sched/nr_switches") << endl
            << "    Voluntary Switches: " << GetInt("sched/nr_voluntary_switches") << endl
-           << "    Involuntary Switches: "
-           << GetInt("sched/nr_involuntary_switches") << endl
+           << "    Involuntary Switches: " << GetInt("sched/nr_involuntary_switches")
+           << endl
            << "    Process Priority: " << GetInt("sched/prio") << endl
            << "  File Descriptors: " << endl
            << "    Number of File Descriptors: " << GetInt("fd/count") << endl;
diff --git a/be/src/util/process-state-info.h b/be/src/util/process-state-info.h
index 0b7158b..702861d 100644
--- a/be/src/util/process-state-info.h
+++ b/be/src/util/process-state-info.h
@@ -27,8 +27,8 @@
 namespace impala {
 
 /// ProcessStateInfo is an interface to query for process state information
-/// at runtime. This contains information about I/O, Cgroup, Scheduler,
-/// Process status, as well as the file descriptors that belong to the process.
+/// at runtime. This contains information about I/O, Scheduler, Process status,
+/// as well as the file descriptors that belong to the process.
 /// Below are some of the I/O information will be read from /proc/self/io.
 /// io/rchar[int64]: Number of bytes which this task has caused to be read from storage.
 /// io/wchar[int64]: Number of bytes which this task has caused to be written to storage.
@@ -39,12 +39,6 @@ namespace impala {
 /// io/cancelled_write_bytes[int64]: Number of bytes which this process did not write,
 /// by truncating pagecache.
 
-/// Below are some of the Cgroup information that will be read from /proc/self/cgroup.
-/// cgroup/hierarchy_id[int]: Hierarchy ID number.
-/// cgroup/subsystems[string]: Set of subsystems bound to the hierarchy.
-/// cgroup/control_group[string]: Control group in the hierarchy to which the
-/// process belongs.
-
 /// Below are some of the Scheduler information that will be read from /proc/self/sched.
 /// sched/se.sum_exec_runtime[string]: Sum of execute time.
 /// sched/se.statistics.wait_max[string]: Max wait time in ready queue.
@@ -108,9 +102,6 @@ class ProcessStateInfo {
   /// Read I/O info from /proc/self/io.
   void ReadProcIO();
 
-  /// Read cgroup info from /proc/self/cgroup.
-  void ReadProcCgroup();
-
   /// Read schedule info from /proc/self/sched.
   void ReadProcSched();
 
diff --git a/www/root.tmpl b/www/root.tmpl
index 9d73c41..a33df9b 100644
--- a/www/root.tmpl
+++ b/www/root.tmpl
@@ -48,4 +48,8 @@ this page is rendered -->
 
   <h2>Process Info</h2>
   <pre>{{process_state_info}}</pre>
+
+  <h2>CGroup Info</h2>
+  <pre>{{cgroup_info}}</pre>
+
 {{>www/common-footer.tmpl}}