You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@openwhisk.apache.org by GitBox <gi...@apache.org> on 2022/09/21 05:08:22 UTC

[GitHub] [openwhisk] style95 commented on a diff in pull request #5326: Fix Orphaned Container Edge Case In Paused State of Container Proxy

style95 commented on code in PR #5326:
URL: https://github.com/apache/openwhisk/pull/5326#discussion_r976038644


##########
tests/src/test/scala/org/apache/openwhisk/core/containerpool/v2/test/FunctionPullingContainerProxyTests.scala:
##########
@@ -1036,6 +1048,84 @@ class FunctionPullingContainerProxyTests
     }
   }
 
+  it should "destroy container proxy when stopping due to timeout and getting live count fails" in within(timeout) {
+    val authStore = mock[ArtifactWhiskAuthStore]
+    val namespaceBlacklist: NamespaceBlacklist = new NamespaceBlacklist(authStore)
+    val get = getWhiskAction(Future(action.toWhiskAction))
+    val dataManagementService = TestProbe()
+    val container = new TestContainer
+    val factory = createFactory(Future.successful(container))
+    val acker = createAcker()
+    val store = createStore
+    val collector = createCollector()
+    val counter = getLiveContainerCountFailFirstCall(2)

Review Comment:
   👍 



##########
tests/src/test/scala/org/apache/openwhisk/core/containerpool/v2/test/FunctionPullingContainerProxyTests.scala:
##########
@@ -1036,6 +1048,84 @@ class FunctionPullingContainerProxyTests
     }
   }
 
+  it should "destroy container proxy when stopping due to timeout and getting live count fails" in within(timeout) {
+    val authStore = mock[ArtifactWhiskAuthStore]
+    val namespaceBlacklist: NamespaceBlacklist = new NamespaceBlacklist(authStore)
+    val get = getWhiskAction(Future(action.toWhiskAction))
+    val dataManagementService = TestProbe()
+    val container = new TestContainer
+    val factory = createFactory(Future.successful(container))
+    val acker = createAcker()
+    val store = createStore
+    val collector = createCollector()
+    val counter = getLiveContainerCountFailFirstCall(2)
+    val limit = getWarmedContainerLimit(Future.successful((1, 10.seconds)))
+    val (client, clientFactory) = testClient
+
+    val probe = TestProbe()
+    val machine =
+      probe.childActorOf(
+        FunctionPullingContainerProxy
+          .props(
+            factory,
+            entityStore,
+            namespaceBlacklist,
+            get,
+            dataManagementService.ref,
+            clientFactory,
+            acker,
+            store,
+            collector,
+            counter,
+            limit,
+            InvokerInstanceId(0, userMemory = defaultUserMemory),
+            invokerHealthManager.ref,
+            poolConfig,
+            timeoutConfig))
+
+    registerCallback(machine, probe)
+    probe watch machine
+
+    machine ! Initialize(invocationNamespace.asString, fqn, action, schedulerHost, rpcPort, messageTransId)
+    probe.expectMsg(Transition(machine, Uninitialized, CreatingClient))
+    client.expectMsg(StartClient)
+    client.send(machine, ClientCreationCompleted())
+
+    probe.expectMsg(Transition(machine, CreatingClient, ClientCreated))
+    expectInitialized(probe)
+    client.expectMsg(RequestActivation())
+    client.send(machine, message)
+
+    probe.expectMsg(Transition(machine, ClientCreated, Running))
+    client.expectMsg(ContainerWarmed)
+    client.expectMsgPF() {
+      case RequestActivation(Some(_), None) => true
+    }
+
+    machine ! StateTimeout
+    client.send(machine, RetryRequestActivation)
+    probe.expectMsg(Transition(machine, Running, Pausing))
+    probe.expectMsgType[ContainerIsPaused]
+    probe.expectMsg(Transition(machine, Pausing, Paused))
+
+    machine ! StateTimeout
+    client.expectMsg(StopClientProxy)
+    probe.expectMsgAllOf(ContainerRemoved(true), Transition(machine, Paused, Removing))

Review Comment:
   Can we add a check to see if `DataManagementService` properly receives the clean-up messages?



##########
core/invoker/src/main/scala/org/apache/openwhisk/core/containerpool/v2/FunctionPullingContainerProxy.scala:
##########
@@ -732,7 +735,12 @@ class FunctionPullingContainerProxy(
         data.action.fullyQualifiedName(false),
         data.action.rev,
         Some(data.clientProxy))
-
+    case Event(t: FailureMessage, data: WarmData) =>
+      logging.error(
+        this,
+        s"Failed to determine whether to keep or remove container on pause timeout for ${data.container.containerId}, retrying. Caused by: $t")
+      startSingleTimer(DetermineKeepContainer.toString, DetermineKeepContainer, 1.second)

Review Comment:
   Is there a reason to wait 1 second before firing the retry timer?
   The wait delays deletion of the etcd key for the problematic container, so another request can still be routed to this container during that time.
   That request will be rescheduled, but this still delays container creation as well.
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@openwhisk.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org