From fd05c66b298afd19c8100634cf164d728493f5d4 Mon Sep 17 00:00:00 2001 From: sithembiso Date: Tue, 21 Oct 2025 15:48:39 +0200 Subject: [PATCH 1/5] Add better logging. --- pkg/node/node.go | 78 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 74 insertions(+), 4 deletions(-) diff --git a/pkg/node/node.go b/pkg/node/node.go index 2e1022bf..5daec3e6 100644 --- a/pkg/node/node.go +++ b/pkg/node/node.go @@ -427,26 +427,53 @@ func (n Node) GetNodeNameFromProviderID(providerId string) (string, error) { return "", nil } + log.Debug(). + Str("target_provider_id", providerId). + Msg("Looking up node by ProviderID") + listOptions := metav1.ListOptions{} nodes, err := n.drainHelper.Client.CoreV1().Nodes().List(context.TODO(), listOptions) if err != nil { log.Err(err).Msgf("Error when trying to list nodes to find node with ProviderID") - return "", err } + log.Debug(). + Int("total_nodes", len(nodes.Items)). + Str("looking_for", providerId). + Msg("Retrieved nodes from API") + for _, n := range nodes.Items { + log.Trace(). + Str("node_name", n.GetObjectMeta().GetName()). + Str("node_provider_id", n.Spec.ProviderID). + Str("comparing_to", providerId). + Bool("match", n.Spec.ProviderID == providerId). + Msg("Checking node") + if n.Spec.ProviderID == providerId { labels := n.GetObjectMeta().GetLabels() if hostname, ok := labels["kubernetes.io/hostname="]; ok { + log.Debug(). + Str("found_hostname", hostname). + Msg("Returning hostname from label") return hostname, nil } - return n.GetObjectMeta().GetName(), nil + nodeName := n.GetObjectMeta().GetName() + log.Debug(). + Str("found_node", nodeName). + Msg("Returning node name") + return nodeName, nil } } + log.Warn(). + Str("provider_id_not_found", providerId). + Int("nodes_checked", len(nodes.Items)). + Msg("Node with ProviderID was not found in the cluster") + return "", fmt.Errorf("Node with ProviderID '%s' was not found in the cluster", providerId) } @@ -652,7 +679,14 @@ func (n Node) fetchKubernetesNode(nodeName string) (*corev1.Node, error) { if n.nthConfig.DryRun { return node, nil } + shortNodeName := strings.Split(nodeName, ".")[0] + + log.Debug(). + Str("node_name", nodeName). + Str("short_node_name", shortNodeName). + Msg("Attempting to fetch Kubernetes node") + labelSelector := metav1.LabelSelector{ MatchExpressions: []metav1.LabelSelectorRequirement{ { @@ -663,11 +697,47 @@ func (n Node) fetchKubernetesNode(nodeName string) (*corev1.Node, error) { }, } listOptions := metav1.ListOptions{LabelSelector: metav1.FormatLabelSelector(&labelSelector)} + + log.Debug(). + Str("label_selector", metav1.FormatLabelSelector(&labelSelector)). + Msg("Listing nodes with label selector") + matchingNodes, err := n.drainHelper.Client.CoreV1().Nodes().List(context.TODO(), listOptions) + if err != nil || len(matchingNodes.Items) == 0 { - log.Warn().Msgf("Unable to list Nodes w/ label, falling back to direct Get lookup of node") - return n.drainHelper.Client.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{}) + if err != nil { + log.Warn(). + Err(err). + Str("node_name", nodeName). + Msg("Unable to list Nodes w/ label, falling back to direct Get lookup of node") + } else { + log.Warn(). + Str("node_name", nodeName). + Str("label_selector", metav1.FormatLabelSelector(&labelSelector)). + Int("matching_nodes", 0). + Msg("No nodes found with label selector, falling back to direct Get lookup of node") + } + + node, getErr := n.drainHelper.Client.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{}) + if getErr != nil { + log.Error(). + Err(getErr). + Str("node_name", nodeName). + Msg("Failed to get node directly") + } else { + log.Debug(). + Str("node_name", nodeName). + Msg("Successfully fetched node via direct Get") + } + return node, getErr } + + log.Debug(). + Str("node_name", nodeName). + Int("matching_nodes", len(matchingNodes.Items)). + Str("selected_node", matchingNodes.Items[0].Name). + Msg("Found node(s) via label selector") + return &matchingNodes.Items[0], nil } From 12f307ae6f0709cc1eddd930319f6de99eeb0985 Mon Sep 17 00:00:00 2001 From: sithembiso Date: Tue, 21 Oct 2025 15:59:39 +0200 Subject: [PATCH 2/5] Switch to debug mode. --- pkg/node/node.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/node/node.go b/pkg/node/node.go index 5daec3e6..a3150f13 100644 --- a/pkg/node/node.go +++ b/pkg/node/node.go @@ -444,7 +444,7 @@ func (n Node) GetNodeNameFromProviderID(providerId string) (string, error) { Msg("Retrieved nodes from API") for _, n := range nodes.Items { - log.Trace(). + log.Debug(). Str("node_name", n.GetObjectMeta().GetName()). Str("node_provider_id", n.Spec.ProviderID). Str("comparing_to", providerId). From 47cdf2822a2aabf35f97f2afd84e387b2c415618 Mon Sep 17 00:00:00 2001 From: sithembiso Date: Wed, 22 Oct 2025 10:42:19 +0200 Subject: [PATCH 3/5] Uses the resolved node name for tainting operations. If UseProviderId enabled. --- pkg/monitor/sqsevent/asg-lifecycle-event.go | 13 ++++++++++++- .../sqsevent/rebalance-recommendation-event.go | 14 +++++++++++++- pkg/monitor/sqsevent/scheduled-change-event.go | 14 +++++++++++++- pkg/monitor/sqsevent/spot-itn-event.go | 14 +++++++++++++- pkg/node/node.go | 1 + 5 files changed, 52 insertions(+), 4 deletions(-) diff --git a/pkg/monitor/sqsevent/asg-lifecycle-event.go b/pkg/monitor/sqsevent/asg-lifecycle-event.go index 6fd4d852..9716ba02 100644 --- a/pkg/monitor/sqsevent/asg-lifecycle-event.go +++ b/pkg/monitor/sqsevent/asg-lifecycle-event.go @@ -126,7 +126,18 @@ func (m SQSMonitor) asgTerminationToInterruptionEvent(event *EventBridgeEvent, m go m.SendHeartbeats(nthConfig.HeartbeatInterval, nthConfig.HeartbeatUntil, lifecycleDetail, stopHeartbeatCh, cancelHeartbeatCh) } - err := n.TaintASGLifecycleTermination(interruptionEvent.NodeName, interruptionEvent.EventID) + // Use provider ID to resolve the actual Kubernetes node name if UseProviderId is configured + nodeName := interruptionEvent.NodeName + if nthConfig.UseProviderId && interruptionEvent.ProviderID != "" { + resolvedNodeName, err := n.GetNodeNameFromProviderID(interruptionEvent.ProviderID) + if err != nil { + log.Warn().Err(err).Str("provider_id", interruptionEvent.ProviderID).Msg("Failed to resolve node name from provider ID, falling back to NodeName from event") + } else { + nodeName = resolvedNodeName + } + } + + err := n.TaintASGLifecycleTermination(nodeName, interruptionEvent.EventID) if err != nil { log.Err(err).Msgf("unable to taint node with taint %s:%s", node.ASGLifecycleTerminationTaint, interruptionEvent.EventID) } diff --git a/pkg/monitor/sqsevent/rebalance-recommendation-event.go b/pkg/monitor/sqsevent/rebalance-recommendation-event.go index 0ecc30fb..baa0aad2 100644 --- a/pkg/monitor/sqsevent/rebalance-recommendation-event.go +++ b/pkg/monitor/sqsevent/rebalance-recommendation-event.go @@ -78,7 +78,19 @@ func (m SQSMonitor) rebalanceRecommendationToInterruptionEvent(event *EventBridg return nil } interruptionEvent.PreDrainTask = func(interruptionEvent monitor.InterruptionEvent, n node.Node) error { - err := n.TaintRebalanceRecommendation(interruptionEvent.NodeName, interruptionEvent.EventID) + // Use provider ID to resolve the actual Kubernetes node name if UseProviderId is configured + nthConfig := n.GetNthConfig() + nodeName := interruptionEvent.NodeName + if nthConfig.UseProviderId && interruptionEvent.ProviderID != "" { + resolvedNodeName, err := n.GetNodeNameFromProviderID(interruptionEvent.ProviderID) + if err != nil { + log.Warn().Err(err).Str("provider_id", interruptionEvent.ProviderID).Msg("Failed to resolve node name from provider ID, falling back to NodeName from event") + } else { + nodeName = resolvedNodeName + } + } + + err := n.TaintRebalanceRecommendation(nodeName, interruptionEvent.EventID) if err != nil { log.Err(err).Msgf("Unable to taint node with taint %s:%s", node.RebalanceRecommendationTaint, interruptionEvent.EventID) } diff --git a/pkg/monitor/sqsevent/scheduled-change-event.go b/pkg/monitor/sqsevent/scheduled-change-event.go index 1fe68cda..f688b9d5 100644 --- a/pkg/monitor/sqsevent/scheduled-change-event.go +++ b/pkg/monitor/sqsevent/scheduled-change-event.go @@ -113,7 +113,19 @@ func (m SQSMonitor) scheduledEventToInterruptionEvents(event *EventBridgeEvent, return nil } interruptionEvent.PreDrainTask = func(interruptionEvent monitor.InterruptionEvent, n node.Node) error { - if err := n.TaintScheduledMaintenance(interruptionEvent.NodeName, interruptionEvent.EventID); err != nil { + // Use provider ID to resolve the actual Kubernetes node name if UseProviderId is configured + nthConfig := n.GetNthConfig() + nodeName := interruptionEvent.NodeName + if nthConfig.UseProviderId && interruptionEvent.ProviderID != "" { + resolvedNodeName, err := n.GetNodeNameFromProviderID(interruptionEvent.ProviderID) + if err != nil { + log.Warn().Err(err).Str("provider_id", interruptionEvent.ProviderID).Msg("Failed to resolve node name from provider ID, falling back to NodeName from event") + } else { + nodeName = resolvedNodeName + } + } + + if err := n.TaintScheduledMaintenance(nodeName, interruptionEvent.EventID); err != nil { log.Err(err).Msgf("Unable to taint node with taint %s:%s", node.ScheduledMaintenanceTaint, interruptionEvent.EventID) } return nil diff --git a/pkg/monitor/sqsevent/spot-itn-event.go b/pkg/monitor/sqsevent/spot-itn-event.go index d0aa476e..a5f21e1e 100644 --- a/pkg/monitor/sqsevent/spot-itn-event.go +++ b/pkg/monitor/sqsevent/spot-itn-event.go @@ -80,7 +80,19 @@ func (m SQSMonitor) spotITNTerminationToInterruptionEvent(event *EventBridgeEven return nil } interruptionEvent.PreDrainTask = func(interruptionEvent monitor.InterruptionEvent, n node.Node) error { - err := n.TaintSpotItn(interruptionEvent.NodeName, interruptionEvent.EventID) + // Use provider ID to resolve the actual Kubernetes node name if UseProviderId is configured + nthConfig := n.GetNthConfig() + nodeName := interruptionEvent.NodeName + if nthConfig.UseProviderId && interruptionEvent.ProviderID != "" { + resolvedNodeName, err := n.GetNodeNameFromProviderID(interruptionEvent.ProviderID) + if err != nil { + log.Warn().Err(err).Str("provider_id", interruptionEvent.ProviderID).Msg("Failed to resolve node name from provider ID, falling back to NodeName from event") + } else { + nodeName = resolvedNodeName + } + } + + err := n.TaintSpotItn(nodeName, interruptionEvent.EventID) if err != nil { log.Err(err).Msgf("Unable to taint node with taint %s:%s", node.SpotInterruptionTaint, interruptionEvent.EventID) } diff --git a/pkg/node/node.go b/pkg/node/node.go index a3150f13..2aaa5923 100644 --- a/pkg/node/node.go +++ b/pkg/node/node.go @@ -696,6 +696,7 @@ func (n Node) fetchKubernetesNode(nodeName string) (*corev1.Node, error) { }, }, } + listOptions := metav1.ListOptions{LabelSelector: metav1.FormatLabelSelector(&labelSelector)} log.Debug(). From c5e01894cdd8ea8dc19344505352b1ed43c2b938 Mon Sep 17 00:00:00 2001 From: sithembiso Date: Wed, 22 Oct 2025 14:03:10 +0200 Subject: [PATCH 4/5] Revert "Switch to debug mode." This reverts commit 12f307ae6f0709cc1eddd930319f6de99eeb0985. --- pkg/node/node.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/node/node.go b/pkg/node/node.go index 2aaa5923..5269d0df 100644 --- a/pkg/node/node.go +++ b/pkg/node/node.go @@ -444,7 +444,7 @@ func (n Node) GetNodeNameFromProviderID(providerId string) (string, error) { Msg("Retrieved nodes from API") for _, n := range nodes.Items { - log.Debug(). + log.Trace(). Str("node_name", n.GetObjectMeta().GetName()). Str("node_provider_id", n.Spec.ProviderID). Str("comparing_to", providerId). From 3fed7a4f14af3cf49038d2b7e0c50fd7ed135f8f Mon Sep 17 00:00:00 2001 From: sithembiso Date: Wed, 22 Oct 2025 14:03:24 +0200 Subject: [PATCH 5/5] Revert "Add better logging." This reverts commit fd05c66b298afd19c8100634cf164d728493f5d4. --- pkg/node/node.go | 78 +++--------------------------------------------- 1 file changed, 4 insertions(+), 74 deletions(-) diff --git a/pkg/node/node.go b/pkg/node/node.go index 5269d0df..b80d6ebf 100644 --- a/pkg/node/node.go +++ b/pkg/node/node.go @@ -427,53 +427,26 @@ func (n Node) GetNodeNameFromProviderID(providerId string) (string, error) { return "", nil } - log.Debug(). - Str("target_provider_id", providerId). - Msg("Looking up node by ProviderID") - listOptions := metav1.ListOptions{} nodes, err := n.drainHelper.Client.CoreV1().Nodes().List(context.TODO(), listOptions) if err != nil { log.Err(err).Msgf("Error when trying to list nodes to find node with ProviderID") + return "", err } - log.Debug(). - Int("total_nodes", len(nodes.Items)). - Str("looking_for", providerId). - Msg("Retrieved nodes from API") - for _, n := range nodes.Items { - log.Trace(). - Str("node_name", n.GetObjectMeta().GetName()). - Str("node_provider_id", n.Spec.ProviderID). - Str("comparing_to", providerId). - Bool("match", n.Spec.ProviderID == providerId). - Msg("Checking node") - if n.Spec.ProviderID == providerId { labels := n.GetObjectMeta().GetLabels() if hostname, ok := labels["kubernetes.io/hostname="]; ok { - log.Debug(). - Str("found_hostname", hostname). - Msg("Returning hostname from label") return hostname, nil } - nodeName := n.GetObjectMeta().GetName() - log.Debug(). - Str("found_node", nodeName). - Msg("Returning node name") - return nodeName, nil + return n.GetObjectMeta().GetName(), nil } } - log.Warn(). - Str("provider_id_not_found", providerId). - Int("nodes_checked", len(nodes.Items)). - Msg("Node with ProviderID was not found in the cluster") - return "", fmt.Errorf("Node with ProviderID '%s' was not found in the cluster", providerId) } @@ -679,14 +652,7 @@ func (n Node) fetchKubernetesNode(nodeName string) (*corev1.Node, error) { if n.nthConfig.DryRun { return node, nil } - shortNodeName := strings.Split(nodeName, ".")[0] - - log.Debug(). - Str("node_name", nodeName). - Str("short_node_name", shortNodeName). - Msg("Attempting to fetch Kubernetes node") - labelSelector := metav1.LabelSelector{ MatchExpressions: []metav1.LabelSelectorRequirement{ { @@ -698,47 +664,11 @@ func (n Node) fetchKubernetesNode(nodeName string) (*corev1.Node, error) { } listOptions := metav1.ListOptions{LabelSelector: metav1.FormatLabelSelector(&labelSelector)} - - log.Debug(). - Str("label_selector", metav1.FormatLabelSelector(&labelSelector)). - Msg("Listing nodes with label selector") - matchingNodes, err := n.drainHelper.Client.CoreV1().Nodes().List(context.TODO(), listOptions) - if err != nil || len(matchingNodes.Items) == 0 { - if err != nil { - log.Warn(). - Err(err). - Str("node_name", nodeName). - Msg("Unable to list Nodes w/ label, falling back to direct Get lookup of node") - } else { - log.Warn(). - Str("node_name", nodeName). - Str("label_selector", metav1.FormatLabelSelector(&labelSelector)). - Int("matching_nodes", 0). - Msg("No nodes found with label selector, falling back to direct Get lookup of node") - } - - node, getErr := n.drainHelper.Client.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{}) - if getErr != nil { - log.Error(). - Err(getErr). - Str("node_name", nodeName). - Msg("Failed to get node directly") - } else { - log.Debug(). - Str("node_name", nodeName). - Msg("Successfully fetched node via direct Get") - } - return node, getErr + log.Warn().Msgf("Unable to list Nodes w/ label, falling back to direct Get lookup of node") + return n.drainHelper.Client.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{}) } - - log.Debug(). - Str("node_name", nodeName). - Int("matching_nodes", len(matchingNodes.Items)). - Str("selected_node", matchingNodes.Items[0].Name). - Msg("Found node(s) via label selector") - return &matchingNodes.Items[0], nil }