Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🌱 Priorities machine with remediate-machine anotation when selecting the next machine to be remediated #11495

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion api/v1beta1/common_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ const (
// MachineSkipRemediationAnnotation is the annotation used to mark the machines that should not be considered for remediation by MachineHealthCheck reconciler.
MachineSkipRemediationAnnotation = "cluster.x-k8s.io/skip-remediation"

// RemediateMachineAnnotation is the annotation used to mark machines that should be remediated by MachineHealthCheck reconciler.
// RemediateMachineAnnotation request the MachineHealthCheck reconciler to mark a Machine as unhealthy. CAPI builtin remediation will prioritize Machines with the annotation to be remediated.
RemediateMachineAnnotation = "cluster.x-k8s.io/remediate-machine"

// MachineSetSkipPreflightChecksAnnotation is the annotation used to provide a comma-separated list of
Expand Down
11 changes: 9 additions & 2 deletions controlplane/kubeadm/internal/controllers/remediation.go
Original file line number Diff line number Diff line change
Expand Up @@ -336,9 +336,16 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
return ctrl.Result{Requeue: true}, nil
}

// Gets the machine to be remediated, which is the oldest machine marked as unhealthy not yet provisioned (if any)
// or the oldest machine marked as unhealthy.
// Gets the machine which is marked as unhealthy and to be remediated, in the following order
// Oldest machine with RemediateMachineAnnotation annotation, if any
// Oldest machine not yet provisioned, if any
// Oldest machine marked as unhealthy.
func getMachineToBeRemediated(unhealthyMachines collections.Machines) *clusterv1.Machine {
annotatedMachine := unhealthyMachines.Filter(collections.HasAnnotationKey(clusterv1.RemediateMachineAnnotation)).Oldest()
Karthik-K-N marked this conversation as resolved.
Show resolved Hide resolved
if annotatedMachine != nil {
return annotatedMachine
}

machineToBeRemediated := unhealthyMachines.Filter(collections.Not(collections.HasNode())).Oldest()
if machineToBeRemediated == nil {
machineToBeRemediated = unhealthyMachines.Oldest()
Expand Down
43 changes: 43 additions & 0 deletions controlplane/kubeadm/internal/controllers/remediation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,49 @@ import (
)

func TestGetMachineToBeRemediated(t *testing.T) {
t.Run("returns the oldest machine with RemediateMachineAnnotation even if there are provisioning machines", func(t *testing.T) {
g := NewWithT(t)

ns, err := env.CreateNamespace(ctx, "ns1")
g.Expect(err).ToNot(HaveOccurred())
defer func() {
g.Expect(env.Cleanup(ctx, ns)).To(Succeed())
}()

m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed())
m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withMachineHealthCheckFailed())
m3 := createMachine(ctx, g, ns.Name, "m3-unhealthy-", withMachineHealthCheckFailed(), withoutNodeRef())
m4 := createMachine(ctx, g, ns.Name, "m4-unhealthy-", withMachineHealthCheckFailed(), withoutNodeRef())

m1.SetAnnotations(map[string]string{clusterv1.RemediateMachineAnnotation: ""})
m2.SetAnnotations(map[string]string{clusterv1.RemediateMachineAnnotation: ""})

unhealthyMachines := collections.FromMachines(m2, m1, m3, m4)

g.Expect(getMachineToBeRemediated(unhealthyMachines).Name).To(HavePrefix("m1-unhealthy-"))
})

t.Run("returns the oldest machine with RemediateMachineAnnotation if there are no provisioning machines", func(t *testing.T) {
g := NewWithT(t)

ns, err := env.CreateNamespace(ctx, "ns1")
g.Expect(err).ToNot(HaveOccurred())
defer func() {
g.Expect(env.Cleanup(ctx, ns)).To(Succeed())
}()

m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed())
m2 := createMachine(ctx, g, ns.Name, "m2-unhealthy-", withMachineHealthCheckFailed())
m3 := createMachine(ctx, g, ns.Name, "m3-unhealthy-", withMachineHealthCheckFailed())

m1.SetAnnotations(map[string]string{clusterv1.RemediateMachineAnnotation: ""})
m2.SetAnnotations(map[string]string{clusterv1.RemediateMachineAnnotation: ""})

unhealthyMachines := collections.FromMachines(m2, m1, m3)

g.Expect(getMachineToBeRemediated(unhealthyMachines).Name).To(HavePrefix("m1-unhealthy-"))
})

t.Run("returns the oldest machine if there are no provisioning machines", func(t *testing.T) {
g := NewWithT(t)

Expand Down
25 changes: 19 additions & 6 deletions internal/controllers/machineset/machineset_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -1404,12 +1404,7 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) (
return ctrl.Result{}, nil
}

// Sort the machines from newest to oldest.
// We are trying to remediate machines failing to come up first because
// there is a chance that they are not hosting any workloads (minimize disruption).
sort.SliceStable(machinesToRemediate, func(i, j int) bool {
return machinesToRemediate[i].CreationTimestamp.After(machinesToRemediate[j].CreationTimestamp.Time)
})
sortMachinesToRemediate(machinesToRemediate)

// Check if we should limit the in flight operations.
if len(machinesToRemediate) > maxInFlight {
Expand Down Expand Up @@ -1578,3 +1573,21 @@ func (r *Reconciler) reconcileExternalTemplateReference(ctx context.Context, clu

return false, patchHelper.Patch(ctx, obj)
}

// Returns the machines to be remediated in the following order
// - Machines with RemediateMachineAnnotation annotation if any,
// - Machines failing to come up first because
// there is a chance that they are not hosting any workloads (minimize disruption).
func sortMachinesToRemediate(machines []*clusterv1.Machine) {
sort.SliceStable(machines, func(i, j int) bool {
_, iHasRemediateAnnotation := machines[i].Annotations[clusterv1.RemediateMachineAnnotation]
_, jHasRemediateAnnotation := machines[j].Annotations[clusterv1.RemediateMachineAnnotation]
if iHasRemediateAnnotation && !jHasRemediateAnnotation {
return true
}
if !iHasRemediateAnnotation && jHasRemediateAnnotation {
return false
}
return machines[i].CreationTimestamp.After(machines[j].CreationTimestamp.Time)
})
}
108 changes: 108 additions & 0 deletions internal/controllers/machineset/machineset_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package machineset

import (
"fmt"
"sort"
"testing"
"time"

Expand Down Expand Up @@ -2774,3 +2775,110 @@ func TestNewMachineUpToDateCondition(t *testing.T) {
})
}
}

func TestSortMachinesToRemediate(t *testing.T) {
unhealthyMachinesWithAnnotations := []*clusterv1.Machine{}
for i := range 4 {
unhealthyMachinesWithAnnotations = append(unhealthyMachinesWithAnnotations, &clusterv1.Machine{
ObjectMeta: metav1.ObjectMeta{
Name: fmt.Sprintf("unhealthy-annotated-machine-%d", i),
Namespace: "default",
CreationTimestamp: metav1.Time{Time: metav1.Now().Add(time.Duration(i) * time.Second)},
Annotations: map[string]string{
clusterv1.RemediateMachineAnnotation: "",
},
},
Status: clusterv1.MachineStatus{
Conditions: []clusterv1.Condition{
{
Type: clusterv1.MachineOwnerRemediatedCondition,
Status: corev1.ConditionFalse,
},
{
Type: clusterv1.MachineHealthCheckSucceededCondition,
Status: corev1.ConditionFalse,
},
},
V1Beta2: &clusterv1.MachineV1Beta2Status{
Conditions: []metav1.Condition{
{
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineOwnerRemediatedWaitingForRemediationV1Beta2Reason,
Message: "Waiting for remediation",
},
{
Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineHealthCheckHasRemediateAnnotationV1Beta2Reason,
Message: "Marked for remediation via cluster.x-k8s.io/remediate-machine annotation",
},
},
},
},
})
}

unhealthyMachines := []*clusterv1.Machine{}
for i := range 4 {
unhealthyMachines = append(unhealthyMachines, &clusterv1.Machine{
ObjectMeta: metav1.ObjectMeta{
Name: fmt.Sprintf("unhealthy-machine-%d", i),
Namespace: "default",
CreationTimestamp: metav1.Time{Time: metav1.Now().Add(time.Duration(i) * time.Second)},
},
Status: clusterv1.MachineStatus{
Conditions: []clusterv1.Condition{
{
Type: clusterv1.MachineOwnerRemediatedCondition,
Status: corev1.ConditionFalse,
},
{
Type: clusterv1.MachineHealthCheckSucceededCondition,
Status: corev1.ConditionFalse,
},
},
V1Beta2: &clusterv1.MachineV1Beta2Status{
Conditions: []metav1.Condition{
{
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineOwnerRemediatedWaitingForRemediationV1Beta2Reason,
Message: "Waiting for remediation",
},
{
Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineHealthCheckHasRemediateAnnotationV1Beta2Reason,
Message: "Marked for remediation via cluster.x-k8s.io/remediate-machine annotation",
},
},
},
},
})
}

t.Run("remediation machines should be sorted with newest first", func(t *testing.T) {
g := NewWithT(t)
machines := unhealthyMachines
sortMachinesToRemediate(machines)
sort.SliceStable(unhealthyMachines, func(i, j int) bool {
return machines[i].CreationTimestamp.After(machines[j].CreationTimestamp.Time)
})
g.Expect(unhealthyMachines).To(Equal(machines))
})

t.Run("remediation machines with annotation should be prioritised over other machines", func(t *testing.T) {
g := NewWithT(t)
machines := append(unhealthyMachines, unhealthyMachinesWithAnnotations...)
sortMachinesToRemediate(machines)

sort.SliceStable(unhealthyMachines, func(i, j int) bool {
return unhealthyMachines[i].CreationTimestamp.After(unhealthyMachines[j].CreationTimestamp.Time)
})
sort.SliceStable(unhealthyMachinesWithAnnotations, func(i, j int) bool {
return unhealthyMachinesWithAnnotations[i].CreationTimestamp.After(unhealthyMachinesWithAnnotations[j].CreationTimestamp.Time)
})
g.Expect(machines).To(Equal(append(unhealthyMachinesWithAnnotations, unhealthyMachines...)))
})
}
Loading