🐛 Fix e2e test for dockermachinePool #11440
base: main
```diff
@@ -46,12 +46,12 @@ import (
 // reconcileDockerContainers manages the Docker containers for a MachinePool such that it
 // - Ensures the number of up-to-date Docker containers is equal to the MachinePool's desired replica count.
 // - Does not delete any containers as that must be triggered in reconcileDockerMachines to ensure node cordon/drain.
 // - Create the DockerMachine CR after creating the container.
 //
 // Providers should similarly create their infrastructure instances and reconcile any additional logic.
 func (r *DockerMachinePoolReconciler) reconcileDockerContainers(ctx context.Context, cluster *clusterv1.Cluster, machinePool *expv1.MachinePool, dockerMachinePool *infraexpv1.DockerMachinePool) error {
 	log := ctrl.LoggerFrom(ctx)

-	log.V(2).Info("Reconciling Docker containers", "DockerMachinePool", klog.KObj(dockerMachinePool))
+	log.Info("Reconciling Docker containers", "DockerMachinePool", klog.KObj(dockerMachinePool))

 	labelFilters := map[string]string{dockerMachinePoolLabel: dockerMachinePool.Name}
```
```diff
@@ -63,11 +63,17 @@ func (r *DockerMachinePoolReconciler) reconcileDockerContainers(ctx context.Cont
 	matchingMachineCount := len(machinesMatchingInfrastructureSpec(ctx, machines, machinePool, dockerMachinePool))
 	numToCreate := int(*machinePool.Spec.Replicas) - matchingMachineCount
 	for range numToCreate {
-		log.V(2).Info("Creating a new Docker container for machinePool", "MachinePool", klog.KObj(machinePool))
+		log.Info("Creating a new Docker container for machinePool", "MachinePool", klog.KObj(machinePool))
 		name := fmt.Sprintf("worker-%s", util.RandomString(6))
 		if err := createDockerContainer(ctx, name, cluster, machinePool, dockerMachinePool); err != nil {
 			return errors.Wrap(err, "failed to create a new docker machine")
 		}
+
+		log.Info("Creating a new DockerMachine for dockerMachinePool", "DockerMachinePool", klog.KObj(dockerMachinePool))
+		dockerMachine := computeDesiredDockerMachine(name, cluster, machinePool, dockerMachinePool, nil)
+		if err := ssa.Patch(ctx, r.Client, dockerMachinePoolControllerName, dockerMachine); err != nil {
+			return errors.Wrap(err, "failed to create a new docker machine")
+		}
 	}

 	return nil
```
```diff
@@ -107,15 +113,14 @@ func createDockerContainer(ctx context.Context, name string, cluster *clusterv1.

 // reconcileDockerMachines creates and deletes DockerMachines to match the MachinePool's desired number of replicas and infrastructure spec.
 // It is responsible for
-// - Ensuring each Docker container has an associated DockerMachine by creating one if it doesn't already exist.
 // - Ensuring that deletion for Docker container happens by calling delete on the associated Machine so that the node is cordoned/drained and the infrastructure is cleaned up.
 // - Deleting DockerMachines referencing a container whose Kubernetes version or custom image no longer matches the spec.
 // - Deleting DockerMachines that correspond to a deleted/non-existent Docker container.
 // - Deleting DockerMachines when scaling down such that DockerMachines whose owner Machine has the clusterv1.DeleteMachineAnnotation is given priority.
 func (r *DockerMachinePoolReconciler) reconcileDockerMachines(ctx context.Context, cluster *clusterv1.Cluster, machinePool *expv1.MachinePool, dockerMachinePool *infraexpv1.DockerMachinePool) error {
 	log := ctrl.LoggerFrom(ctx)

-	log.V(2).Info("Reconciling DockerMachines", "DockerMachinePool", klog.KObj(dockerMachinePool))
+	log.Info("Reconciling DockerMachines", "DockerMachinePool", klog.KObj(dockerMachinePool))
```

Review comment: Let's please keep using log levels (also for all other places in this PR).

```diff

 	dockerMachineList, err := getDockerMachines(ctx, r.Client, *cluster, *machinePool, *dockerMachinePool)
 	if err != nil {
```
```diff
@@ -140,36 +145,13 @@ func (r *DockerMachinePoolReconciler) reconcileDockerMachines(ctx context.Contex
 	}

-	// Step 1:
-	// Create a DockerMachine for each Docker container so we surface the information to the user. Use the same name as the Docker container for the Docker Machine for ease of lookup.
-	// Providers should iterate through their infrastructure instances and ensure that each instance has a corresponding InfraMachine.
-	for _, machine := range externalMachines {
-		if existingMachine, ok := dockerMachineMap[machine.Name()]; ok {
-			log.V(2).Info("Patching existing DockerMachine", "DockerMachine", klog.KObj(&existingMachine))
```
Review comment: Are we losing this entire branch? I don't follow how this change solves the problem.

Author reply: I moved the creation of the DockerMachine next to the container creation to avoid the separate loop that creates the DockerMachine CR based on the previously created container. Example from the logs: the DockerMachine fails to patch because it has been deleted, the next log shows the DockerMachine being created (I think the DockerMachinePool goes into deletion at the same time the patch is creating the missing DockerMachine), and then we get stuck waiting for the Machine to be created, which never happens because the DockerMachinePool and MachinePool are gone.

Review comment: Have you considered preventing creation of new machines in this func when the DockerMachinePool has a deletion timestamp? (It should probably be an if around L155-L161.)

Author reply: Well, the code logic shouldn't allow this to happen (it should go through the delete reconcile). In any case, tying the creation of the DockerMachine to the container creation is a better implementation that avoids such random execution ordering.
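For reference, a minimal sketch of the guard suggested above, assuming it sits just before the creation loop in reconcileDockerContainers and reuses that function's existing dockerMachinePool and log variables; it is not part of this PR:

```go
// Hypothetical guard (not part of this PR): skip creating new Docker containers
// and DockerMachines once the DockerMachinePool is being deleted, so the delete
// reconcile can converge without racing against new creations.
if !dockerMachinePool.DeletionTimestamp.IsZero() {
	log.Info("DockerMachinePool has a deletionTimestamp, skipping creation of new replicas")
	return nil
}
```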
Review comment: This is confusing me a little bit (and my lack of knowledge in MP doesn't help).

Review comment: I'm too new to CAPD to have a strong opinion, but my concern is that creation now only happens if the controller creates a Docker container. If one is created in another way, for instance manually, it won't be reconciled into a DockerMachine. If we're fine about this thread, the rest LGTM.

Author reply: It's strange to me as well: why do we even create DockerMachines? As far as I understand, we use MachinePool so that we don't manage individual machines. If there is special handling for the MachinePool machines, then we create MachinePool Machines to perform this special handling (which is not the case for DockerMachinePool and DockerMachine).

Review comment: You can find context in https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20220209-machinepool-machines.md. Considering this is a test provider, I think the real question is whether the current implementation is enough to validate the MachinePool implementation in core CAPI (in other words, I think it is ok if it isn't perfect, but it should serve a goal). It goes down into the cache implementation in controller-runtime, informers, etc.; see https://github.com/kubernetes-sigs/controller-runtime/tree/v0.19.3/pkg/cache. In order to get this moving, what I suggest is: (*) the change should be implemented here, and this is an example of the wait loop we should implement (replace MS with DockerMachines, fix up error management).

Author reply: Thanks @fabriziopandini for referring to the cache implementation. However, what you are referring to is not helping; you can reproduce it by using this repo: just build the e2e tests and then run the test.

Review comment: Thanks for testing my theory. I don't yet have a good explanation for why this happens, but I would prefer to take the time to investigate this properly, because controller-runtime should prevent those race conditions from happening, and if it doesn't, we should root-cause and get a fix there. I will try to reproduce locally, but I can't guarantee when I will get to it with the upcoming release and the usual EOY rush. cc @mboersma @AndiDog, who might be interested in working in this area as well.
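As a rough illustration of the wait-loop idea mentioned above (the helper name, package name, timeouts, and error handling below are assumptions, not the referenced example): after creating DockerMachines via SSA, the reconciler could poll the cached client until the new objects are observed, so subsequent reconcile steps don't act on a stale cache.

```go
package dockermachinepool // package name assumed for illustration

import (
	"context"
	"time"

	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/util/wait"
	"sigs.k8s.io/controller-runtime/pkg/client"

	infrav1 "sigs.k8s.io/cluster-api/test/infrastructure/docker/api/v1beta1"
)

// waitForDockerMachinesInCache is a hypothetical helper: it blocks until every
// newly created DockerMachine is visible through the (cached) client, or the
// timeout expires.
func waitForDockerMachinesInCache(ctx context.Context, c client.Client, dockerMachines []*infrav1.DockerMachine) error {
	return wait.PollUntilContextTimeout(ctx, 100*time.Millisecond, 10*time.Second, true, func(ctx context.Context) (bool, error) {
		for _, dm := range dockerMachines {
			if err := c.Get(ctx, client.ObjectKeyFromObject(dm), &infrav1.DockerMachine{}); err != nil {
				if apierrors.IsNotFound(err) {
					// Not yet observed by the cache, keep polling.
					return false, nil
				}
				return false, err
			}
		}
		return true, nil
	})
}
```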
```diff
-
-			desiredMachine := computeDesiredDockerMachine(machine.Name(), cluster, machinePool, dockerMachinePool, &existingMachine)
-			if err := ssa.Patch(ctx, r.Client, dockerMachinePoolControllerName, desiredMachine, ssa.WithCachingProxy{Cache: r.ssaCache, Original: &existingMachine}); err != nil {
-				return errors.Wrapf(err, "failed to update DockerMachine %q", klog.KObj(desiredMachine))
-			}
-
-			dockerMachineMap[desiredMachine.Name] = *desiredMachine
-		} else {
-			log.V(2).Info("Creating a new DockerMachine for Docker container", "container", machine.Name())
-			desiredMachine := computeDesiredDockerMachine(machine.Name(), cluster, machinePool, dockerMachinePool, nil)
-			if err := ssa.Patch(ctx, r.Client, dockerMachinePoolControllerName, desiredMachine); err != nil {
-				return errors.Wrap(err, "failed to create a new docker machine")
-			}
-
-			dockerMachineMap[desiredMachine.Name] = *desiredMachine
-		}
-	}

 	// Step 2:
 	// Delete any DockerMachines that correspond to a deleted Docker container.
 	// Providers should iterate through the InfraMachines to ensure each one still corresponds to an existing infrastructure instance.
 	// This allows the InfraMachine (and owner Machine) to be deleted and avoid hanging resources when a user deletes an instance out-of-band.
 	for _, dockerMachine := range dockerMachineMap {
 		if _, ok := externalMachineMap[dockerMachine.Name]; !ok {
 			dockerMachine := dockerMachine
-			log.V(2).Info("Deleting DockerMachine with no underlying infrastructure", "DockerMachine", klog.KObj(&dockerMachine))
+			log.Info("Deleting DockerMachine with no underlying infrastructure", "DockerMachine", klog.KObj(&dockerMachine))
 			if err := r.deleteMachinePoolMachine(ctx, dockerMachine); err != nil {
 				return err
 			}
```
```diff
@@ -178,7 +160,7 @@ func (r *DockerMachinePoolReconciler) reconcileDockerMachines(ctx context.Contex
 		}
 	}

-	// Step 3:
+	// Step 2:
 	// This handles the scale down/excess replicas case and the case where a rolling upgrade is needed.
 	// If there are more ready DockerMachines than desired replicas, start to delete the excess DockerMachines such that
 	// - DockerMachines with an outdated Kubernetes version or custom image are deleted first (i.e. the rolling upgrade).
```
```diff
@@ -218,7 +200,7 @@ func (r *DockerMachinePoolReconciler) reconcileDockerMachines(ctx context.Contex
 	for _, dockerMachine := range outdatedMachines {
 		if overProvisionCount > 0 {
 			dockerMachine := dockerMachine
-			log.V(2).Info("Deleting DockerMachine because it is outdated", "DockerMachine", klog.KObj(&dockerMachine))
+			log.Info("Deleting DockerMachine because it is outdated", "DockerMachine", klog.KObj(&dockerMachine))
 			if err := r.deleteMachinePoolMachine(ctx, dockerMachine); err != nil {
 				return err
 			}
```
```diff
@@ -231,7 +213,7 @@ func (r *DockerMachinePoolReconciler) reconcileDockerMachines(ctx context.Contex
 	for _, dockerMachine := range readyMachines {
 		if overProvisionCount > 0 {
 			dockerMachine := dockerMachine
-			log.V(2).Info("Deleting DockerMachine because it is an excess replica", "DockerMachine", klog.KObj(&dockerMachine))
+			log.Info("Deleting DockerMachine because it is an excess replica", "DockerMachine", klog.KObj(&dockerMachine))
 			if err := r.deleteMachinePoolMachine(ctx, dockerMachine); err != nil {
 				return err
 			}
```
```diff
@@ -272,6 +254,7 @@ func computeDesiredDockerMachine(name string, cluster *clusterv1.Cluster, machin
 		Name: dockerMachinePool.Name,
 		UID:  dockerMachinePool.UID,
 	}))
+
 	dockerMachine.Labels[clusterv1.ClusterNameLabel] = cluster.Name
 	dockerMachine.Labels[clusterv1.MachinePoolNameLabel] = format.MustFormatValue(machinePool.Name)

```
```diff
@@ -288,7 +271,7 @@ func (r *DockerMachinePoolReconciler) deleteMachinePoolMachine(ctx context.Conte
 	}
 	// util.GetOwnerMachine() returns a nil Machine without error if there is no Machine kind in the ownerRefs, so we must verify that machine is not nil.
 	if machine == nil {
-		log.V(2).Info("No owner Machine exists for DockerMachine", "dockerMachine", klog.KObj(&dockerMachine))
+		log.Info("No owner Machine exists for DockerMachine", "dockerMachine", klog.KObj(&dockerMachine))

 		// If the DockerMachine does not have an owner Machine, do not attempt to delete the DockerMachine as the MachinePool controller will create the
 		// Machine and we want to let it catch up. If we are too hasty to delete, that introduces a race condition where the DockerMachine could be deleted
```
```diff
@@ -297,7 +280,8 @@ func (r *DockerMachinePoolReconciler) deleteMachinePoolMachine(ctx context.Conte
 		// In the case where the MachinePool is being deleted and the Machine will never come online, the DockerMachine will be deleted via its ownerRef to the
 		// DockerMachinePool, so that is covered as well.

-		return nil
+		// Returning error as we need the dockerMachine not to proceed.
+		return errors.New("No owner Machine exists for DockerMachine")
 	}

 	log.Info("Deleting Machine for DockerMachine", "Machine", klog.KObj(machine), "DockerMachine", klog.KObj(&dockerMachine))
```
Review comment: The delete call seems unchanged.

Author reply: I kept the delete because, if the container gets deleted outside of CAPI, the DockerMachines have to reflect what exists in the infrastructure. It also handles the replica scale-down use case if something goes wrong.

Review comment: But then we probably shouldn't drop the comment mentioning the deletion?