You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by jo...@apache.org on 2018/11/14 21:23:20 UTC

[mesos] branch 1.7.x updated (1cdb750 -> 57958fb)

This is an automated email from the ASF dual-hosted git repository.

josephwu pushed a change to branch 1.7.x
in repository https://gitbox.apache.org/repos/asf/mesos.git.


    from 1cdb750  Updated `launchHelper` to use launcher's `whitelistFds` parameter.
     new 05dae62  Added validation of cache files to the URI Fetcher.
     new 1f4e317  Added `FetcherCacheTest.LocalCachedMissing` test.
     new 57958fb  Added MESOS-7574 to 1.7.1 CHANGELOG.

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 CHANGELOG                                   |  1 +
 src/slave/containerizer/fetcher.cpp         | 40 +++++++++++++++++++++++++++++
 src/slave/containerizer/fetcher_process.hpp |  3 +++
 src/tests/fetcher_cache_tests.cpp           | 39 ++++++++++++++++++++++++++++
 4 files changed, 83 insertions(+)


[mesos] 01/03: Added validation of cache files to the URI Fetcher.

Posted by jo...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

josephwu pushed a commit to branch 1.7.x
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 05dae62e65cd1eadf5eb577914412bc7b947a9c8
Author: Andrei Budnik <ab...@mesosphere.com>
AuthorDate: Wed Nov 14 11:51:24 2018 -0800

    Added validation of cache files to the URI Fetcher.
    
    Previously, missing cache files could lead to task launch failures.
    This patch introduces validation of cache files which happens each
    time a downloaded cache entry is requested via the `get()` method.
    If validation fails, `get()` returns `None()`, so that the fetcher
    retries downloading URI. Currently, we only check the existence of
    the cache file during validation.
    
    Review: https://reviews.apache.org/r/69171/
---
 src/slave/containerizer/fetcher.cpp         | 40 +++++++++++++++++++++++++++++
 src/slave/containerizer/fetcher_process.hpp |  3 +++
 2 files changed, 43 insertions(+)

diff --git a/src/slave/containerizer/fetcher.cpp b/src/slave/containerizer/fetcher.cpp
index 17f5388..fe7e139 100644
--- a/src/slave/containerizer/fetcher.cpp
+++ b/src/slave/containerizer/fetcher.cpp
@@ -1013,6 +1013,22 @@ FetcherProcess::Cache::get(
 
   Option<shared_ptr<Entry>> entry = table.get(key);
   if (entry.isSome()) {
+    // The FetcherProcess will always remove a failed download
+    // synchronously after marking this future as failed.
+    CHECK(!entry.get()->completion().isFailed());
+
+    // Validate the cache file, if it has been downloaded.
+    if (entry.get()->completion().isReady()) {
+      Try<Nothing> validation = validate(entry.get());
+      if (validation.isError()) {
+        LOG(WARNING) << "Validation failed: '" + validation.error() +
+                        "'. Removing cache entry...";
+
+        remove(entry.get());
+        return None();
+      }
+    }
+
     // Refresh the cache entry by moving it to the back of lruSortedEntries.
     lruSortedEntries.remove(entry.get());
     lruSortedEntries.push_back(entry.get());
@@ -1048,6 +1064,7 @@ bool FetcherProcess::Cache::contains(
 //   (1) We failed to determine its prospective cache file size.
 //   (2) We failed to download it when invoking the mesos-fetcher.
 //   (3) We're evicting it to make room for another entry.
+//   (4) We failed to validate the cache file.
 //
 // In (1) and (2) the contract is that we'll have failed the entry's
 // future before we call remove, so the entry's future should no
@@ -1059,6 +1076,9 @@ bool FetcherProcess::Cache::contains(
 // reference count and therefore the future must either be ready or
 // failed in which case this is just case (1) above.
 //
+// In (4) we explicitly only validate a cache file if the future is
+// ready (i.e., the file has been downloaded).
+//
 // NOTE: It is not necessarily the case that this cache entry has
 // zero references because there might be some waiters on the
 // downloading of this entry which haven't been able to run and find
@@ -1161,6 +1181,26 @@ Try<Nothing> FetcherProcess::Cache::reserve(
 }
 
 
+Try<Nothing> FetcherProcess::Cache::validate(
+    const std::shared_ptr<Cache::Entry>& entry)
+{
+    VLOG(1) << "Validating cache entry '" << entry->key
+            << "' with filename: " << entry->filename;
+
+    if (!os::exists(entry->path().string())) {
+      return Error("Cache file does not exist: " + entry->filename);
+    }
+
+    // TODO(abudnik): Consider adding validation of the cache file by either:
+    //   1. Comparing a known file checksum with the actual checksum of the file
+    //      stored on disk.
+    //   2. Reading the whole file by chunks. Many filesystems detect data
+    //      corruptions when reading file's data. As a positive side effect,
+    //      the file's data will be loaded into the page cache.
+    return Nothing();
+}
+
+
 Try<Nothing> FetcherProcess::Cache::adjust(
     const shared_ptr<FetcherProcess::Cache::Entry>& entry)
 {
diff --git a/src/slave/containerizer/fetcher_process.hpp b/src/slave/containerizer/fetcher_process.hpp
index 56ed6e5..0d4b787 100644
--- a/src/slave/containerizer/fetcher_process.hpp
+++ b/src/slave/containerizer/fetcher_process.hpp
@@ -190,6 +190,9 @@ public:
     size_t size() const;
 
   private:
+    // Returns whether the cache file exists and not corrupted.
+    Try<Nothing> validate(const std::shared_ptr<Cache::Entry>& entry);
+
     // Maximum storable number of bytes in the cache directory.
     const Bytes space;
 


[mesos] 03/03: Added MESOS-7574 to 1.7.1 CHANGELOG.

Posted by jo...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

josephwu pushed a commit to branch 1.7.x
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 57958fbb9c877f87489cce24830ce88155e20da9
Author: Joseph Wu <jo...@apache.org>
AuthorDate: Wed Nov 14 13:21:20 2018 -0800

    Added MESOS-7574 to 1.7.1 CHANGELOG.
---
 CHANGELOG | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG b/CHANGELOG
index e487b89..5d3bc99 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -7,6 +7,7 @@ Release Notes - Mesos - Version 1.7.1 (WIP)
     cycle time in some benchmarks by 80%.
 
 ** Bug
+  * [MESOS-7474] - Mesos fetcher cache doesn't retry when missed.
   * [MESOS-8545] - AgentAPIStreamingTest.AttachInputToNestedContainerSession is flaky.
   * [MESOS-8907] - Docker image fetcher fails with HTTP/2.
   * [MESOS-8978] - Command executor calling setsid breaks the tty support.


[mesos] 02/03: Added `FetcherCacheTest.LocalCachedMissing` test.

Posted by jo...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

josephwu pushed a commit to branch 1.7.x
in repository https://gitbox.apache.org/repos/asf/mesos.git

commit 1f4e317f3e78a0891759c4990a202a81a0c63f2b
Author: Andrei Budnik <ab...@mesosphere.com>
AuthorDate: Wed Nov 14 11:53:33 2018 -0800

    Added `FetcherCacheTest.LocalCachedMissing` test.
    
    This test verifies that the fetcher retries downloading URI when the
    cache file is missing.
    
    Review: https://reviews.apache.org/r/69172/
---
 src/tests/fetcher_cache_tests.cpp | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/src/tests/fetcher_cache_tests.cpp b/src/tests/fetcher_cache_tests.cpp
index 22a7c0f..8766ffe 100644
--- a/src/tests/fetcher_cache_tests.cpp
+++ b/src/tests/fetcher_cache_tests.cpp
@@ -697,6 +697,45 @@ TEST_F(FetcherCacheTest, LocalCached)
 }
 
 
+// This test launches a task with enabled cache, then removes all cached files,
+// then attempts to launch another task with the same URIs as the first task.
+// We expect that the fetcher retries to download all the artifacts when cached
+// files are missing.
+TEST_F(FetcherCacheTest, LocalCachedMissing)
+{
+  startSlave();
+  driver->start();
+
+  for (size_t i = 0; i < 2; i++) {
+    CommandInfo::URI uri;
+    uri.set_value(commandPath);
+    uri.set_executable(true);
+    uri.set_cache(true);
+
+    CommandInfo commandInfo;
+    commandInfo.set_value("./" + COMMAND_NAME + " " + taskName(i));
+    commandInfo.add_uris()->CopyFrom(uri);
+
+    const Try<Task> task = launchTask(commandInfo, i);
+    ASSERT_SOME(task);
+
+    AWAIT_READY(awaitFinished(task.get()));
+
+    const string path = path::join(task->runDirectory.string(), COMMAND_NAME);
+    EXPECT_TRUE(isExecutable(path));
+    EXPECT_TRUE(os::exists(path + taskName(i)));
+
+    EXPECT_EQ(1u, fetcherProcess->cacheSize());
+    ASSERT_SOME(fetcherProcess->cacheFiles());
+    EXPECT_EQ(1u, fetcherProcess->cacheFiles()->size());
+
+    verifyCacheMetrics();
+
+    EXPECT_SOME(os::rm(fetcherProcess->cacheFiles()->front()));
+  }
+}
+
+
 TEST_F(FetcherCacheTest, CachedCustomFilename)
 {
   startSlave();