Posted to reviews@yunikorn.apache.org by GitBox <gi...@apache.org> on 2022/01/25 01:07:30 UTC

[GitHub] [incubator-yunikorn-k8shim] yangwwei commented on a change in pull request #359: [YUNIKORN-1041] Fix binding of dynamic volumes to pod

yangwwei commented on a change in pull request #359:
URL: https://github.com/apache/incubator-yunikorn-k8shim/pull/359#discussion_r791274687



##########
File path: pkg/cache/context.go
##########
@@ -365,19 +366,68 @@ func (ctx *Context) bindPodVolumes(pod *v1.Pod) error {
 				zap.String("podName", pod.Name))
 		} else {
 			log.Logger().Info("Binding Pod Volumes", zap.String("podName", pod.Name))
-			boundClaims, claimsToBind, _, err := ctx.apiProvider.GetAPIs().VolumeBinder.GetPodVolumes(assumedPod)
+			boundClaims, claimsToBind, unboundClaimsImmediate, err := ctx.apiProvider.GetAPIs().VolumeBinder.GetPodVolumes(assumedPod)
 			if err != nil {
+				log.Logger().Error("Failed to get pod volumes",
+					zap.String("podName", assumedPod.Name),
+					zap.Error(err))
+				return err
+			}
+			if len(unboundClaimsImmediate) > 0 {
+				err = fmt.Errorf("pod %s has unbound immediate claims", pod.Name)
+				log.Logger().Error("Pod has unbound immediate claims",
+					zap.String("podName", assumedPod.Name),
+					zap.Error(err))
 				return err
 			}
 			node, err := ctx.schedulerCache.GetNodeInfo(assumedPod.Spec.NodeName)
 			if err != nil {
+				log.Logger().Error("Failed to get node info",
+					zap.String("podName", assumedPod.Name),
+					zap.String("nodeName", assumedPod.Spec.NodeName),
+					zap.Error(err))
+				return err
+			}
+			volumes, reasons, err := ctx.apiProvider.GetAPIs().VolumeBinder.FindPodVolumes(assumedPod, boundClaims, claimsToBind, node)
+			if err != nil {
+				log.Logger().Error("Failed to find pod volumes",
+					zap.String("podName", assumedPod.Name),
+					zap.String("nodeName", assumedPod.Spec.NodeName),
+					zap.Int("claimsToBind", len(claimsToBind)),
+					zap.Error(err))
 				return err
 			}
-			volumes, _, err := ctx.apiProvider.GetAPIs().VolumeBinder.FindPodVolumes(assumedPod, boundClaims, claimsToBind, node)
+			if len(reasons) > 0 {
+				sReasons := make([]string, 0)
+				for _, reason := range reasons {
+					sReasons = append(sReasons, string(reason))
+				}
+				sReason := strings.Join(sReasons, ", ")
+				err = fmt.Errorf("pod %s has conflicting volume claims: %s", pod.Name, sReason)
+				log.Logger().Error("Pod has conflicting volume claims",
+					zap.String("podName", assumedPod.Name),
+					zap.String("nodeName", assumedPod.Spec.NodeName),
+					zap.Int("claimsToBind", len(claimsToBind)),
+					zap.Error(err))
+				return err
+			}

Review comment:
       Good catch.
   
   ```
   volumes, reasons, err := ctx.apiProvider.GetAPIs().VolumeBinder.FindPodVolumes(assumedPod, boundClaims, claimsToBind, node)
   ```
    It seems there can be a case where `err == nil` but `len(reasons) > 0`:
   
    > // It returns an error when something went wrong or a list of reasons why the node is
    > // (currently) not usable for the pod.
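
    Both signals need handling; a minimal hypothetical helper showing the pattern (not the PR's code — the binder's conflict-reason slice is simplified to `[]string`, and `fmt` and `strings` are assumed imported):

    ```go
    // checkFindPodVolumesResult treats err and reasons as independent failure
    // signals: err is an internal error, while a non-empty reasons list means
    // the node is (currently) not usable for the pod.
    func checkFindPodVolumesResult(podName string, reasons []string, err error) error {
    	if err != nil {
    		return err // something went wrong internally
    	}
    	if len(reasons) > 0 {
    		// no internal error, but the node cannot host this pod right now
    		return fmt.Errorf("pod %s: node not usable: %s", podName, strings.Join(reasons, ", "))
    	}
    	return nil
    }
    ```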
   

##########
File path: pkg/cache/context.go
##########
@@ -365,19 +366,68 @@ func (ctx *Context) bindPodVolumes(pod *v1.Pod) error {
 				zap.String("podName", pod.Name))
 		} else {
 			log.Logger().Info("Binding Pod Volumes", zap.String("podName", pod.Name))
-			boundClaims, claimsToBind, _, err := ctx.apiProvider.GetAPIs().VolumeBinder.GetPodVolumes(assumedPod)
+			boundClaims, claimsToBind, unboundClaimsImmediate, err := ctx.apiProvider.GetAPIs().VolumeBinder.GetPodVolumes(assumedPod)
 			if err != nil {
+				log.Logger().Error("Failed to get pod volumes",
+					zap.String("podName", assumedPod.Name),
+					zap.Error(err))
+				return err
+			}
+			if len(unboundClaimsImmediate) > 0 {
+				err = fmt.Errorf("pod %s has unbound immediate claims", pod.Name)
+				log.Logger().Error("Pod has unbound immediate claims",
+					zap.String("podName", assumedPod.Name),
+					zap.Error(err))
 				return err
 			}
 			node, err := ctx.schedulerCache.GetNodeInfo(assumedPod.Spec.NodeName)
 			if err != nil {
+				log.Logger().Error("Failed to get node info",
+					zap.String("podName", assumedPod.Name),
+					zap.String("nodeName", assumedPod.Spec.NodeName),
+					zap.Error(err))
+				return err
+			}
+			volumes, reasons, err := ctx.apiProvider.GetAPIs().VolumeBinder.FindPodVolumes(assumedPod, boundClaims, claimsToBind, node)
+			if err != nil {
+				log.Logger().Error("Failed to find pod volumes",
+					zap.String("podName", assumedPod.Name),
+					zap.String("nodeName", assumedPod.Spec.NodeName),
+					zap.Int("claimsToBind", len(claimsToBind)),
+					zap.Error(err))
 				return err
 			}
-			volumes, _, err := ctx.apiProvider.GetAPIs().VolumeBinder.FindPodVolumes(assumedPod, boundClaims, claimsToBind, node)
+			if len(reasons) > 0 {
+				sReasons := make([]string, 0)
+				for _, reason := range reasons {
+					sReasons = append(sReasons, string(reason))
+				}
+				sReason := strings.Join(sReasons, ", ")
+				err = fmt.Errorf("pod %s has conflicting volume claims: %s", pod.Name, sReason)
+				log.Logger().Error("Pod has conflicting volume claims",
+					zap.String("podName", assumedPod.Name),
+					zap.String("nodeName", assumedPod.Spec.NodeName),
+					zap.Int("claimsToBind", len(claimsToBind)),
+					zap.Error(err))
+				return err
+			}
+			if volumes.StaticBindings == nil {
+				// convert nil to empty array
+				volumes.StaticBindings = make([]*scheduling.BindingInfo, 0)
+			}
+			if volumes.DynamicProvisions == nil {
+				// convert nil to empty array
+				volumes.DynamicProvisions = make([]*v1.PersistentVolumeClaim, 0)
+			}

Review comment:
       Why does this fix the problem?
    I did not see anything that complains about a nil value.
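
    For context, nil and empty slices are interchangeable for `len` and `range` in Go, which is why the fix is not obvious; they differ mainly once serialized. A generic illustration (not the PR's code path):

    ```go
    package main

    import (
    	"encoding/json"
    	"fmt"
    )

    func main() {
    	var nilSlice []string           // nil
    	emptySlice := make([]string, 0) // non-nil, length 0

    	// len and range treat the two identically.
    	fmt.Println(len(nilSlice), len(emptySlice)) // 0 0

    	// Serialization does not: nil marshals to null, empty to [].
    	a, _ := json.Marshal(nilSlice)
    	b, _ := json.Marshal(emptySlice)
    	fmt.Println(string(a), string(b)) // null []
    }
    ```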

##########
File path: pkg/cache/context.go
##########
@@ -365,19 +366,68 @@ func (ctx *Context) bindPodVolumes(pod *v1.Pod) error {
 				zap.String("podName", pod.Name))
 		} else {
 			log.Logger().Info("Binding Pod Volumes", zap.String("podName", pod.Name))
-			boundClaims, claimsToBind, _, err := ctx.apiProvider.GetAPIs().VolumeBinder.GetPodVolumes(assumedPod)
+			boundClaims, claimsToBind, unboundClaimsImmediate, err := ctx.apiProvider.GetAPIs().VolumeBinder.GetPodVolumes(assumedPod)
 			if err != nil {
+				log.Logger().Error("Failed to get pod volumes",
+					zap.String("podName", assumedPod.Name),
+					zap.Error(err))
+				return err
+			}
+			if len(unboundClaimsImmediate) > 0 {
+				err = fmt.Errorf("pod %s has unbound immediate claims", pod.Name)
+				log.Logger().Error("Pod has unbound immediate claims",
+					zap.String("podName", assumedPod.Name),
+					zap.Error(err))

Review comment:
       Is this the right place to check for unbound immediate claims? Maybe it's too late? The PVC relies on the PV controller to do the binding; before that happens, the default scheduler [skips scheduling the pod in the PreFilter phase](https://github.com/kubernetes/kubernetes/blob/c592bd40f2df941aa4ea364592ce92fd5c669bfc/pkg/scheduler/framework/plugins/volumebinding/volume_binding.go#L185-L187). Our code here runs pretty late: returning an error means we failed to schedule the pod, and the user will see a "PodVolumesBindFailure" error on the pod. Would it make sense to do this check [here](https://github.com/apache/incubator-yunikorn-k8shim/blob/799a42ed9b247815fd344f70e0df27d6b83f90d2/pkg/cache/application.go#L414) instead? That is a sanity check to ensure a pod is ready for scheduling.
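
    To make the suggestion concrete, a hypothetical sketch (the function name and placement are invented; `GetPodVolumes`, its four return values, and the `scheduling` package alias follow the diff):

    ```go
    // checkNoUnboundImmediateClaims rejects a pod before scheduling if it still
    // has immediate-binding PVCs waiting on the PV controller, instead of
    // failing later at bind time with PodVolumesBindFailure.
    func checkNoUnboundImmediateClaims(pod *v1.Pod, binder scheduling.SchedulerVolumeBinder) error {
    	_, _, unboundClaimsImmediate, err := binder.GetPodVolumes(pod)
    	if err != nil {
    		return err
    	}
    	if len(unboundClaimsImmediate) > 0 {
    		// not an internal error: the pod is simply not ready for scheduling yet
    		return fmt.Errorf("pod %s has unbound immediate PersistentVolumeClaims", pod.Name)
    	}
    	return nil
    }
    ```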



