Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🌱 Priorities machine with remediate-machine anotation when selecting the next machine to be remediated #11495

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion api/v1beta1/common_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ const (
// MachineSkipRemediationAnnotation is the annotation used to mark the machines that should not be considered for remediation by MachineHealthCheck reconciler.
MachineSkipRemediationAnnotation = "cluster.x-k8s.io/skip-remediation"

// RemediateMachineAnnotation is the annotation used to mark machines that should be remediated by MachineHealthCheck reconciler.
// RemediateMachineAnnotation request the MachineHealthCheck reconciler to mark a Machine as unhealthy. CAPI builtin remediation will prioritize Machines with the annotation to be remediated.
RemediateMachineAnnotation = "cluster.x-k8s.io/remediate-machine"

// MachineSetSkipPreflightChecksAnnotation is the annotation used to provide a comma-separated list of
Expand Down
11 changes: 9 additions & 2 deletions controlplane/kubeadm/internal/controllers/remediation.go
Original file line number Diff line number Diff line change
Expand Up @@ -336,9 +336,16 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
return ctrl.Result{Requeue: true}, nil
}

// Gets the machine to be remediated, which is the oldest machine marked as unhealthy not yet provisioned (if any)
// or the oldest machine marked as unhealthy.
// Gets the machine which is marked as unhealthy and to be remediated, in the following order
// Oldest machine with RemediateMachineAnnotation annotation, if any
// Oldest machine not yet provisioned, if any
// Oldest machine marked as unhealthy.
func getMachineToBeRemediated(unhealthyMachines collections.Machines) *clusterv1.Machine {
annotatedMachine := unhealthyMachines.Filter(collections.HasAnnotationKey(clusterv1.RemediateMachineAnnotation)).Oldest()
Karthik-K-N marked this conversation as resolved.
Show resolved Hide resolved
if annotatedMachine != nil {
return annotatedMachine
}

machineToBeRemediated := unhealthyMachines.Filter(collections.Not(collections.HasNode())).Oldest()
if machineToBeRemediated == nil {
machineToBeRemediated = unhealthyMachines.Oldest()
Expand Down
43 changes: 43 additions & 0 deletions controlplane/kubeadm/internal/controllers/remediation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,49 @@ import (
)

func TestGetMachineToBeRemediated(t *testing.T) {
t.Run("returns the oldest machine with RemediateMachineAnnotation even if there are provisioning machines", func(t *testing.T) {
g := NewWithT(t)

ns, err := env.CreateNamespace(ctx, "ns1")
g.Expect(err).ToNot(HaveOccurred())
defer func() {
g.Expect(env.Cleanup(ctx, ns)).To(Succeed())
}()

m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed())
m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withMachineHealthCheckFailed())
m3 := createMachine(ctx, g, ns.Name, "m3-unhealthy-", withMachineHealthCheckFailed(), withoutNodeRef())
m4 := createMachine(ctx, g, ns.Name, "m4-unhealthy-", withMachineHealthCheckFailed(), withoutNodeRef())

m1.SetAnnotations(map[string]string{clusterv1.RemediateMachineAnnotation: ""})
m2.SetAnnotations(map[string]string{clusterv1.RemediateMachineAnnotation: ""})

unhealthyMachines := collections.FromMachines(m1, m2, m3, m4)
Karthik-K-N marked this conversation as resolved.
Show resolved Hide resolved

g.Expect(getMachineToBeRemediated(unhealthyMachines).Name).To(HavePrefix("m1-unhealthy-"))
})

t.Run("returns the oldest machine with RemediateMachineAnnotation if there are no provisioning machines", func(t *testing.T) {
g := NewWithT(t)

ns, err := env.CreateNamespace(ctx, "ns1")
g.Expect(err).ToNot(HaveOccurred())
defer func() {
g.Expect(env.Cleanup(ctx, ns)).To(Succeed())
}()

m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed())
m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withMachineHealthCheckFailed())
m3 := createMachine(ctx, g, ns.Name, "m3-unhealthy-", withMachineHealthCheckFailed())

m1.SetAnnotations(map[string]string{clusterv1.RemediateMachineAnnotation: ""})
m2.SetAnnotations(map[string]string{clusterv1.RemediateMachineAnnotation: ""})

unhealthyMachines := collections.FromMachines(m1, m2, m3)
Karthik-K-N marked this conversation as resolved.
Show resolved Hide resolved

g.Expect(getMachineToBeRemediated(unhealthyMachines).Name).To(HavePrefix("m1-unhealthy-"))
})

t.Run("returns the oldest machine if there are no provisioning machines", func(t *testing.T) {
g := NewWithT(t)

Expand Down
37 changes: 26 additions & 11 deletions internal/controllers/machineset/machineset_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -1334,10 +1334,10 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) (

// Calculates the Machines to be remediated.
// Note: Machines already deleting are not included, there is no need to trigger remediation for them again.
machinesToRemediate := collections.FromMachines(machines...).Filter(collections.IsUnhealthyAndOwnerRemediated, collections.Not(collections.HasDeletionTimestamp)).UnsortedList()
remediateMachines := collections.FromMachines(machines...).Filter(collections.IsUnhealthyAndOwnerRemediated, collections.Not(collections.HasDeletionTimestamp)).UnsortedList()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If possible, I would keep the existing name, it seems more clear to me


// If there are no machines to remediate return early.
if len(machinesToRemediate) == 0 {
if len(remediateMachines) == 0 {
return ctrl.Result{}, nil
}

Expand All @@ -1350,7 +1350,7 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) (
if isDeploymentChild(ms) {
if owner.Annotations[clusterv1.RevisionAnnotation] != ms.Annotations[clusterv1.RevisionAnnotation] {
// MachineSet is part of a MachineDeployment but isn't the current revision, no remediations allowed.
if err := patchMachineConditions(ctx, r.Client, machinesToRemediate, metav1.Condition{
if err := patchMachineConditions(ctx, r.Client, remediateMachines, metav1.Condition{
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineSetMachineCannotBeRemediatedV1Beta2Reason,
Expand Down Expand Up @@ -1392,8 +1392,8 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) (
// Check if we can remediate any machines.
if maxInFlight <= 0 {
// No tokens available to remediate machines.
log.V(3).Info("Remediation strategy is set, and maximum in flight has been reached", "machinesToBeRemediated", len(machinesToRemediate))
if err := patchMachineConditions(ctx, r.Client, machinesToRemediate, metav1.Condition{
log.V(3).Info("Remediation strategy is set, and maximum in flight has been reached", "machinesToBeRemediated", len(remediateMachines))
if err := patchMachineConditions(ctx, r.Client, remediateMachines, metav1.Condition{
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineSetMachineRemediationDeferredV1Beta2Reason,
Expand All @@ -1404,12 +1404,7 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) (
return ctrl.Result{}, nil
}

// Sort the machines from newest to oldest.
// We are trying to remediate machines failing to come up first because
// there is a chance that they are not hosting any workloads (minimize disruption).
sort.SliceStable(machinesToRemediate, func(i, j int) bool {
return machinesToRemediate[i].CreationTimestamp.After(machinesToRemediate[j].CreationTimestamp.Time)
})
machinesToRemediate := getMachinesToRemediateInOrder(remediateMachines)

// Check if we should limit the in flight operations.
if len(machinesToRemediate) > maxInFlight {
Expand Down Expand Up @@ -1578,3 +1573,23 @@ func (r *Reconciler) reconcileExternalTemplateReference(ctx context.Context, clu

return false, patchHelper.Patch(ctx, obj)
}

// Returns the machines to be remediated in the following order
// Machines with RemediateMachineAnnotation annotation if any,
// Machines failing to come up first because
// there is a chance that they are not hosting any workloads (minimize disruption).
func getMachinesToRemediateInOrder(remediateMachines []*clusterv1.Machine) []*clusterv1.Machine {
// From machines to remediate select the machines with RemediateMachineAnnotation annotation.
annotatedMachines := collections.FromMachines(remediateMachines...).Filter(collections.HasAnnotationKey(clusterv1.RemediateMachineAnnotation))

// Filter out the machines which are unique (machines which are not in annotated machines list)
uniqueMachines := collections.FromMachines(remediateMachines...).Difference(annotatedMachines).UnsortedList()

// Sort the machines from newest to oldest.
sort.SliceStable(uniqueMachines, func(i, j int) bool {
return uniqueMachines[i].CreationTimestamp.After(uniqueMachines[j].CreationTimestamp.Time)
})

// combine the annotated machines along with machines to remediate.
return append(annotatedMachines.UnsortedList(), uniqueMachines...)
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about this alternative implementation

Suggested change
// Returns the machines to be remediated in the following order
// Machines with RemediateMachineAnnotation annotation if any,
// Machines failing to come up first because
// there is a chance that they are not hosting any workloads (minimize disruption).
func getMachinesToRemediateInOrder(remediateMachines []*clusterv1.Machine) []*clusterv1.Machine {
// From machines to remediate select the machines with RemediateMachineAnnotation annotation.
annotatedMachines := collections.FromMachines(remediateMachines...).Filter(collections.HasAnnotationKey(clusterv1.RemediateMachineAnnotation))
// Filter out the machines which are unique (machines which are not in annotated machines list)
uniqueMachines := collections.FromMachines(remediateMachines...).Difference(annotatedMachines).UnsortedList()
// Sort the machines from newest to oldest.
sort.SliceStable(uniqueMachines, func(i, j int) bool {
return uniqueMachines[i].CreationTimestamp.After(uniqueMachines[j].CreationTimestamp.Time)
})
// combine the annotated machines along with machines to remediate.
return append(annotatedMachines.UnsortedList(), uniqueMachines...)
}
// Returns the machines to be remediated in the following order
// - Machines with RemediateMachineAnnotation annotation if any,
// - Machines failing to come up first because
// there is a chance that they are not hosting any workloads (minimize disruption).
func sortMachinesToRemediate(machines []*clusterv1.Machine) {
sort.SliceStable(machines, func(i, j int) bool {
_, iHasRemediateAnnotation := machines[i].Annotations[clusterv1.RemediateMachineAnnotation]
_, jHasRemediateAnnotation := machines[j].Annotations[clusterv1.RemediateMachineAnnotation]
if iHasRemediateAnnotation && !jHasRemediateAnnotation {
return true
}
if !iHasRemediateAnnotation && jHasRemediateAnnotation {
return false
}
return machines[i].CreationTimestamp.After(machines[j].CreationTimestamp.Time)
})
}

(trying to have something easier to read/that sticks to the fact that this is a sorting func)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah makes sense, I think I took the complicated way, I just updated as suggested but still kept in a seperate function to easily test, Let me know if I need to make it inplace and enhance the existing test itself.

104 changes: 104 additions & 0 deletions internal/controllers/machineset/machineset_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package machineset

import (
"fmt"
"sort"
"testing"
"time"

Expand Down Expand Up @@ -2774,3 +2775,106 @@ func TestNewMachineUpToDateCondition(t *testing.T) {
})
}
}

func TestGetMachinesToRemediateInOrder(t *testing.T) {
unhealthyMachinesWithAnnotations := []*clusterv1.Machine{}
for i := range 2 {
unhealthyMachinesWithAnnotations = append(unhealthyMachinesWithAnnotations, &clusterv1.Machine{
ObjectMeta: metav1.ObjectMeta{
Name: fmt.Sprintf("unhealthy-anotated-machine-%d", i),
Karthik-K-N marked this conversation as resolved.
Show resolved Hide resolved
Namespace: "default",
CreationTimestamp: metav1.Time{Time: metav1.Now().Add(time.Duration(i) * time.Second)},
Annotations: map[string]string{
clusterv1.RemediateMachineAnnotation: "",
},
},
Status: clusterv1.MachineStatus{
Conditions: []clusterv1.Condition{
{
Type: clusterv1.MachineOwnerRemediatedCondition,
Status: corev1.ConditionFalse,
},
{
Type: clusterv1.MachineHealthCheckSucceededCondition,
Status: corev1.ConditionFalse,
},
},
V1Beta2: &clusterv1.MachineV1Beta2Status{
Conditions: []metav1.Condition{
{
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineOwnerRemediatedWaitingForRemediationV1Beta2Reason,
Message: "Waiting for remediation",
},
{
Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineHealthCheckHasRemediateAnnotationV1Beta2Reason,
Message: "Marked for remediation via cluster.x-k8s.io/remediate-machine annotation",
},
},
},
},
})
}

unhealthyMachines := []*clusterv1.Machine{}
for i := range 4 {
unhealthyMachines = append(unhealthyMachines, &clusterv1.Machine{
ObjectMeta: metav1.ObjectMeta{
Name: fmt.Sprintf("unhealthy-machine-%d", i),
Namespace: "default",
CreationTimestamp: metav1.Time{Time: metav1.Now().Add(time.Duration(i) * time.Second)},
},
Status: clusterv1.MachineStatus{
Conditions: []clusterv1.Condition{
{
Type: clusterv1.MachineOwnerRemediatedCondition,
Status: corev1.ConditionFalse,
},
{
Type: clusterv1.MachineHealthCheckSucceededCondition,
Status: corev1.ConditionFalse,
},
},
V1Beta2: &clusterv1.MachineV1Beta2Status{
Conditions: []metav1.Condition{
{
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineOwnerRemediatedWaitingForRemediationV1Beta2Reason,
Message: "Waiting for remediation",
},
{
Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineHealthCheckHasRemediateAnnotationV1Beta2Reason,
Message: "Marked for remediation via cluster.x-k8s.io/remediate-machine annotation",
},
},
},
},
})
}

t.Run("remediation machines should be sorted with newest first", func(t *testing.T) {
g := NewWithT(t)
machines := unhealthyMachines
machinesToRemediate := getMachinesToRemediateInOrder(machines)
sort.SliceStable(machines, func(i, j int) bool {
return machines[i].CreationTimestamp.After(machines[j].CreationTimestamp.Time)
})
g.Expect(machinesToRemediate).To(Equal(machines))
})

t.Run("remediation machines with annotation should be prioritised over other machines", func(t *testing.T) {
g := NewWithT(t)
machines := append(unhealthyMachines, unhealthyMachinesWithAnnotations...)
machinesToRemediate := getMachinesToRemediateInOrder(machines)
sort.SliceStable(unhealthyMachines, func(i, j int) bool {
return unhealthyMachines[i].CreationTimestamp.After(unhealthyMachines[j].CreationTimestamp.Time)
})
g.Expect(machinesToRemediate).To(Equal(append(unhealthyMachinesWithAnnotations, unhealthyMachines...)))
})
}
Loading