diff --git a/apis/cluster/v1beta1/membercluster_types.go b/apis/cluster/v1beta1/membercluster_types.go index 3680c681a..853ada9c1 100644 --- a/apis/cluster/v1beta1/membercluster_types.go +++ b/apis/cluster/v1beta1/membercluster_types.go @@ -244,6 +244,17 @@ func (m *MemberCluster) GetAgentStatus(agentType AgentType) *AgentStatus { return nil } +// SetAgentStatus is used to set the agentStatus for a given agentType. +func (m *MemberCluster) SetAgentStatus(agentType AgentType, status AgentStatus) { + for i, s := range m.Status.AgentStatus { + if s.Type == agentType { + m.Status.AgentStatus[i] = status + return + } + } + m.Status.AgentStatus = append(m.Status.AgentStatus, status) +} + // GetAgentCondition queries the conditions in an agent status for a specific condition type. func (m *MemberCluster) GetAgentCondition(agentType AgentType, conditionType AgentConditionType) *metav1.Condition { if s := m.GetAgentStatus(agentType); s != nil { diff --git a/pkg/controllers/membercluster/v1beta1/membercluster_controller.go b/pkg/controllers/membercluster/v1beta1/membercluster_controller.go index 0a3e99de6..dbdcea44f 100644 --- a/pkg/controllers/membercluster/v1beta1/membercluster_controller.go +++ b/pkg/controllers/membercluster/v1beta1/membercluster_controller.go @@ -515,25 +515,46 @@ func (r *Reconciler) syncInternalMemberClusterStatus(imc *clusterv1beta1.Interna } // TODO: We didn't handle condition type: clusterv1beta1.ConditionTypeMemberClusterHealthy. - // Copy Agent status and set ObservedGeneration for agent conditions. - if len(imc.Status.AgentStatus) > 0 { - mc.Status.AgentStatus = make([]clusterv1beta1.AgentStatus, len(imc.Status.AgentStatus)) - } + latestAgentStatus := make([]clusterv1beta1.AgentStatus, 0, len(imc.Status.AgentStatus)) + var memberAgentStatusUpdated bool for i := range imc.Status.AgentStatus { - mc.Status.AgentStatus[i] = *imc.Status.AgentStatus[i].DeepCopy() - // Set ObservedGeneration for each agent condition. - for j := range mc.Status.AgentStatus[i].Conditions { - mc.Status.AgentStatus[i].Conditions[j].ObservedGeneration = mc.GetGeneration() + if imc.Status.AgentStatus[i].Conditions == nil { + err := controller.NewUnexpectedBehaviorError(fmt.Errorf("find an unexpected agent status for member cluster %s", mc.Name)) + klog.ErrorS(controller.NewUnexpectedBehaviorError(err), "Skipping invalid agent status", "memberCluster", klog.KObj(mc), "agentStatus", imc.Status.AgentStatus[i].Type) + continue // skip this invalid agent status + } + + if imc.Status.AgentStatus[i].Conditions[0].ObservedGeneration < imc.GetGeneration() { // assuming the agent status conditions are always updated together + klog.V(2).InfoS("Skipping stale agent status", "memberCluster", klog.KObj(mc), "agentType", imc.Status.AgentStatus[i].Type) + continue // skip stale agent status } + agentType := imc.Status.AgentStatus[i].Type + if agentType == clusterv1beta1.MemberAgent { + memberAgentStatusUpdated = true + } + + mcAgentStatus := *imc.Status.AgentStatus[i].DeepCopy() + for j := range mcAgentStatus.Conditions { + mcAgentStatus.Conditions[j].ObservedGeneration = mc.GetGeneration() // using the mc generation + } + latestAgentStatus = append(latestAgentStatus, mcAgentStatus) + mc.SetAgentStatus(agentType, mcAgentStatus) + } + + r.aggregateJoinedCondition(latestAgentStatus, mc) + if !memberAgentStatusUpdated { + klog.V(2).InfoS("Member agent status not found in internal member cluster status, skip updating member cluster status related fields", "memberCluster", klog.KObj(mc)) + return } - r.aggregateJoinedCondition(mc) + // The remaining fields are only updated when member-agent is updated. // Copy resource usages. mc.Status.ResourceUsage = imc.Status.ResourceUsage // Copy additional conditions. + // Right now all the additional conditions are reported by the member-agent. for idx := range imc.Status.Conditions { cond := imc.Status.Conditions[idx] - cond.ObservedGeneration = mc.GetGeneration() + cond.ObservedGeneration = mc.GetGeneration() // using the mc generation meta.SetStatusCondition(&mc.Status.Conditions, cond) } // Copy the cluster properties. @@ -564,32 +585,33 @@ func (r *Reconciler) updateMemberClusterStatus(ctx context.Context, mc *clusterv } // aggregateJoinedCondition is used to calculate and mark the joined or left status for member cluster based on join conditions from all agents. -func (r *Reconciler) aggregateJoinedCondition(mc *clusterv1beta1.MemberCluster) { +// agentStatus contains the latest status reported by the agents based on the latest mc spec. +func (r *Reconciler) aggregateJoinedCondition(agentStatus []clusterv1beta1.AgentStatus, mc *clusterv1beta1.MemberCluster) { klog.V(4).InfoS("Aggregate joined condition from all agents", "memberCluster", klog.KObj(mc)) var unknownMessage string - if len(mc.Status.AgentStatus) < len(r.agents) { - unknownMessage = fmt.Sprintf("Member cluster %s has not reported all the expected agents, expected %d, got %d", mc.Name, len(r.agents), len(mc.Status.AgentStatus)) + if len(agentStatus) < len(r.agents) { + unknownMessage = fmt.Sprintf("Member cluster %s has not reported all the expected agents, expected %d, got %d", mc.Name, len(r.agents), len(agentStatus)) markMemberClusterUnknown(r.recorder, mc, unknownMessage) return } joined := true left := true reportedAgents := make(map[clusterv1beta1.AgentType]bool) - for _, agentStatus := range mc.Status.AgentStatus { - if !r.agents[agentStatus.Type] { - _ = controller.NewUnexpectedBehaviorError(fmt.Errorf("find an unexpected agent type %s for member cluster %s", agentStatus.Type, mc.Name)) + for _, status := range agentStatus { + if !r.agents[status.Type] { + _ = controller.NewUnexpectedBehaviorError(fmt.Errorf("find an unexpected agent type %s for member cluster %s", status.Type, mc.Name)) continue // ignore any unexpected agent type } - condition := meta.FindStatusCondition(agentStatus.Conditions, string(clusterv1beta1.AgentJoined)) + condition := meta.FindStatusCondition(status.Conditions, string(clusterv1beta1.AgentJoined)) if condition == nil { - unknownMessage = fmt.Sprintf("Member cluster %s has not reported the join condition for agent %s", mc.Name, agentStatus.Type) + unknownMessage = fmt.Sprintf("Member cluster %s has not reported the join condition for agent %s", mc.Name, status.Type) markMemberClusterUnknown(r.recorder, mc, unknownMessage) return } joined = joined && condition.Status == metav1.ConditionTrue left = left && condition.Status == metav1.ConditionFalse - reportedAgents[agentStatus.Type] = true + reportedAgents[status.Type] = true } if len(reportedAgents) < len(r.agents) { diff --git a/pkg/controllers/membercluster/v1beta1/membercluster_controller_test.go b/pkg/controllers/membercluster/v1beta1/membercluster_controller_test.go index 31c1dd249..9be28d6fb 100644 --- a/pkg/controllers/membercluster/v1beta1/membercluster_controller_test.go +++ b/pkg/controllers/membercluster/v1beta1/membercluster_controller_test.go @@ -723,6 +723,9 @@ func TestSyncInternalMemberClusterStatus(t *testing.T) { }, }, internalMemberCluster: &clusterv1beta1.InternalMemberCluster{ + ObjectMeta: metav1.ObjectMeta{ + Generation: imcObservedGeneration, + }, Status: clusterv1beta1.InternalMemberClusterStatus{ Conditions: []metav1.Condition{ { @@ -885,6 +888,9 @@ func TestSyncInternalMemberClusterStatus(t *testing.T) { }, }, internalMemberCluster: &clusterv1beta1.InternalMemberCluster{ + ObjectMeta: metav1.ObjectMeta{ + Generation: imcObservedGeneration, + }, Status: clusterv1beta1.InternalMemberClusterStatus{ ResourceUsage: clusterv1beta1.ResourceUsage{ Capacity: corev1.ResourceList{ @@ -991,6 +997,9 @@ func TestSyncInternalMemberClusterStatus(t *testing.T) { }, }, internalMemberCluster: &clusterv1beta1.InternalMemberCluster{ + ObjectMeta: metav1.ObjectMeta{ + Generation: imcObservedGeneration, + }, Status: clusterv1beta1.InternalMemberClusterStatus{ ResourceUsage: clusterv1beta1.ResourceUsage{ Capacity: corev1.ResourceList{ @@ -1109,15 +1118,7 @@ func TestSyncInternalMemberClusterStatus(t *testing.T) { }, wantedMemberCluster: &clusterv1beta1.MemberCluster{ Status: clusterv1beta1.MemberClusterStatus{ - ResourceUsage: clusterv1beta1.ResourceUsage{ - Capacity: corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("100m"), - }, - Allocatable: corev1.ResourceList{ - corev1.ResourceMemory: resource.MustParse("1Gi"), - }, - ObservationTime: now, - }, + // Resource usage and properties should not be copied when no member agent status is found Conditions: []metav1.Condition{ { Type: string(clusterv1beta1.ConditionTypeMemberClusterJoined), @@ -1149,6 +1150,9 @@ func TestSyncInternalMemberClusterStatus(t *testing.T) { }, }, internalMemberCluster: &clusterv1beta1.InternalMemberCluster{ + ObjectMeta: metav1.ObjectMeta{ + Generation: imcObservedGeneration, + }, Status: clusterv1beta1.InternalMemberClusterStatus{ ResourceUsage: clusterv1beta1.ResourceUsage{ Capacity: corev1.ResourceList{ @@ -1273,6 +1277,9 @@ func TestSyncInternalMemberClusterStatus(t *testing.T) { }, }, internalMemberCluster: &clusterv1beta1.InternalMemberCluster{ + ObjectMeta: metav1.ObjectMeta{ + Generation: imcObservedGeneration, + }, Status: clusterv1beta1.InternalMemberClusterStatus{ ResourceUsage: clusterv1beta1.ResourceUsage{ Capacity: corev1.ResourceList{ @@ -1340,7 +1347,7 @@ func TestSyncInternalMemberClusterStatus(t *testing.T) { }, }, }, - "condition is not reported in the status": { + "invalid agent status withi nil condition should be ignored": { r: &Reconciler{ recorder: utils.NewFakeRecorder(1), agents: map[clusterv1beta1.AgentType]bool{ @@ -1349,6 +1356,9 @@ func TestSyncInternalMemberClusterStatus(t *testing.T) { }, }, internalMemberCluster: &clusterv1beta1.InternalMemberCluster{ + ObjectMeta: metav1.ObjectMeta{ + Generation: imcObservedGeneration, + }, Status: clusterv1beta1.InternalMemberClusterStatus{ ResourceUsage: clusterv1beta1.ResourceUsage{ Capacity: corev1.ResourceList{ @@ -1416,15 +1426,184 @@ func TestSyncInternalMemberClusterStatus(t *testing.T) { }, LastReceivedHeartbeat: now, }, + // ServiceExportImportAgent is excluded because it has nil conditions. + }, + }, + }, + }, + "stale member-agent status should be skipped": { + r: &Reconciler{ + recorder: utils.NewFakeRecorder(1), + agents: map[clusterv1beta1.AgentType]bool{ + clusterv1beta1.MemberAgent: true, + clusterv1beta1.ServiceExportImportAgent: true, + }, + }, + internalMemberCluster: &clusterv1beta1.InternalMemberCluster{ + ObjectMeta: metav1.ObjectMeta{ + Generation: imcObservedGeneration, + }, + Status: clusterv1beta1.InternalMemberClusterStatus{ + ResourceUsage: clusterv1beta1.ResourceUsage{ + Capacity: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("100m"), + }, + Allocatable: corev1.ResourceList{ + corev1.ResourceMemory: resource.MustParse("1Gi"), + }, + ObservationTime: now, + }, + AgentStatus: []clusterv1beta1.AgentStatus{ { - Type: clusterv1beta1.ServiceExportImportAgent, + Type: clusterv1beta1.MemberAgent, + Conditions: []metav1.Condition{ + { + Type: string(clusterv1beta1.AgentJoined), + Status: metav1.ConditionTrue, + Reason: "Joined", + ObservedGeneration: imcObservedGeneration - 1, // stale + }, + }, + LastReceivedHeartbeat: metav1.Time{Time: now.Time.Add(-5 * time.Second)}, + }, + { + Type: clusterv1beta1.ServiceExportImportAgent, + Conditions: []metav1.Condition{ + { + Type: string(clusterv1beta1.AgentJoined), + Status: metav1.ConditionTrue, + Reason: "Joined", + ObservedGeneration: imcObservedGeneration, + }, + }, + LastReceivedHeartbeat: now, + }, + }, + }, + }, + memberCluster: &clusterv1beta1.MemberCluster{ + ObjectMeta: metav1.ObjectMeta{ + Generation: mcObservedGeneration, + }, + Status: clusterv1beta1.MemberClusterStatus{ + Conditions: []metav1.Condition{ + { + Type: string(clusterv1beta1.ConditionTypeMemberClusterJoined), + Status: metav1.ConditionTrue, + Reason: reasonMemberClusterJoined, + ObservedGeneration: mcObservedGeneration - 1, // stale + }, + { + Type: propertyProviderConditionType1, + Status: propertyProviderConditionStatus1, + Reason: propertyProviderConditionReason1, + ObservedGeneration: mcObservedGeneration - 1, // stale + }, + { + Type: propertyProviderConditionType2, + Status: propertyProviderConditionStatus2, + Reason: propertyProviderConditionReason2, + ObservedGeneration: mcObservedGeneration - 1, // stale + }, + }, + ResourceUsage: clusterv1beta1.ResourceUsage{ + Capacity: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("10m"), + }, + Allocatable: corev1.ResourceList{ + corev1.ResourceMemory: resource.MustParse("2Gi"), + }, + ObservationTime: metav1.Time{Time: now.Time.Add(-5 * time.Second)}, + }, + AgentStatus: []clusterv1beta1.AgentStatus{ + { + Type: clusterv1beta1.MemberAgent, + Conditions: []metav1.Condition{ + { + Type: string(clusterv1beta1.AgentJoined), + Status: metav1.ConditionTrue, + Reason: "Joined", + ObservedGeneration: mcObservedGeneration - 1, // stale + }, + }, + LastReceivedHeartbeat: metav1.Time{Time: now.Time.Add(-5 * time.Second)}, + }, + { + Type: clusterv1beta1.ServiceExportImportAgent, + Conditions: []metav1.Condition{ + { + Type: string(clusterv1beta1.AgentJoined), + Status: metav1.ConditionTrue, + Reason: "Joined", + ObservedGeneration: mcObservedGeneration - 1, // stale + }, + }, + LastReceivedHeartbeat: metav1.Time{Time: now.Time.Add(-5 * time.Second)}, + }, + }, + }, + }, + wantedMemberCluster: &clusterv1beta1.MemberCluster{ + Status: clusterv1beta1.MemberClusterStatus{ + Conditions: []metav1.Condition{ + { + Type: string(clusterv1beta1.ConditionTypeMemberClusterJoined), + Status: metav1.ConditionUnknown, + Reason: reasonMemberClusterUnknown, + ObservedGeneration: mcObservedGeneration, + }, + { + Type: propertyProviderConditionType1, + Status: propertyProviderConditionStatus1, + Reason: propertyProviderConditionReason1, + ObservedGeneration: mcObservedGeneration - 1, // stale + }, + { + Type: propertyProviderConditionType2, + Status: propertyProviderConditionStatus2, + Reason: propertyProviderConditionReason2, + ObservedGeneration: mcObservedGeneration - 1, // stale + }, + }, + ResourceUsage: clusterv1beta1.ResourceUsage{ + Capacity: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("10m"), + }, + Allocatable: corev1.ResourceList{ + corev1.ResourceMemory: resource.MustParse("2Gi"), + }, + ObservationTime: metav1.Time{Time: now.Time.Add(-5 * time.Second)}, + }, + AgentStatus: []clusterv1beta1.AgentStatus{ + { + Type: clusterv1beta1.MemberAgent, + Conditions: []metav1.Condition{ + { + Type: string(clusterv1beta1.AgentJoined), + Status: metav1.ConditionTrue, + Reason: "Joined", + ObservedGeneration: mcObservedGeneration - 1, // stale + }, + }, + LastReceivedHeartbeat: metav1.Time{Time: now.Time.Add(-5 * time.Second)}, + }, + { + Type: clusterv1beta1.ServiceExportImportAgent, + Conditions: []metav1.Condition{ + { + Type: string(clusterv1beta1.AgentJoined), + Status: metav1.ConditionTrue, + Reason: "Joined", + ObservedGeneration: mcObservedGeneration, + }, + }, LastReceivedHeartbeat: now, }, }, }, }, }, - "agent type is not reported in the status": { + "stale non member-agent status should be skipped": { r: &Reconciler{ recorder: utils.NewFakeRecorder(1), agents: map[clusterv1beta1.AgentType]bool{ @@ -1433,7 +1612,26 @@ func TestSyncInternalMemberClusterStatus(t *testing.T) { }, }, internalMemberCluster: &clusterv1beta1.InternalMemberCluster{ + ObjectMeta: metav1.ObjectMeta{ + Generation: imcObservedGeneration, + }, Status: clusterv1beta1.InternalMemberClusterStatus{ + Conditions: []metav1.Condition{ + { + Type: propertyProviderConditionType1, + Status: propertyProviderConditionStatus1, + Reason: propertyProviderConditionReason1, + ObservedGeneration: imcObservedGeneration, + LastTransitionTime: now, + }, + { + Type: propertyProviderConditionType2, + Status: propertyProviderConditionStatus2, + Reason: propertyProviderConditionReason2, + ObservedGeneration: imcObservedGeneration, + LastTransitionTime: now, + }, + }, ResourceUsage: clusterv1beta1.ResourceUsage{ Capacity: corev1.ResourceList{ corev1.ResourceCPU: resource.MustParse("100m"), @@ -1457,7 +1655,15 @@ func TestSyncInternalMemberClusterStatus(t *testing.T) { LastReceivedHeartbeat: now, }, { - Type: clusterv1beta1.MultiClusterServiceAgent, + Type: clusterv1beta1.ServiceExportImportAgent, + Conditions: []metav1.Condition{ + { + Type: string(clusterv1beta1.AgentJoined), + Status: metav1.ConditionTrue, + Reason: "Joined", + ObservedGeneration: imcObservedGeneration - 1, // stale + }, + }, LastReceivedHeartbeat: now, }, }, @@ -1467,6 +1673,63 @@ func TestSyncInternalMemberClusterStatus(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Generation: mcObservedGeneration, }, + Status: clusterv1beta1.MemberClusterStatus{ + Conditions: []metav1.Condition{ + { + Type: string(clusterv1beta1.ConditionTypeMemberClusterJoined), + Status: metav1.ConditionTrue, + Reason: reasonMemberClusterJoined, + ObservedGeneration: mcObservedGeneration - 1, // stale + }, + { + Type: propertyProviderConditionType1, + Status: propertyProviderConditionStatus1, + Reason: propertyProviderConditionReason1, + ObservedGeneration: mcObservedGeneration - 1, // stale + }, + { + Type: propertyProviderConditionType2, + Status: propertyProviderConditionStatus2, + Reason: propertyProviderConditionReason2, + ObservedGeneration: mcObservedGeneration - 1, // stale + }, + }, + ResourceUsage: clusterv1beta1.ResourceUsage{ + Capacity: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("10m"), + }, + Allocatable: corev1.ResourceList{ + corev1.ResourceMemory: resource.MustParse("2Gi"), + }, + ObservationTime: metav1.Time{Time: now.Time.Add(-5 * time.Second)}, + }, + AgentStatus: []clusterv1beta1.AgentStatus{ + { + Type: clusterv1beta1.MemberAgent, + Conditions: []metav1.Condition{ + { + Type: string(clusterv1beta1.AgentJoined), + Status: metav1.ConditionTrue, + Reason: "Joined", + ObservedGeneration: mcObservedGeneration - 1, // stale + }, + }, + LastReceivedHeartbeat: metav1.Time{Time: now.Time.Add(-5 * time.Second)}, + }, + { + Type: clusterv1beta1.ServiceExportImportAgent, + Conditions: []metav1.Condition{ + { + Type: string(clusterv1beta1.AgentJoined), + Status: metav1.ConditionTrue, + Reason: "Joined", + ObservedGeneration: mcObservedGeneration - 1, // stale + }, + }, + LastReceivedHeartbeat: metav1.Time{Time: now.Time.Add(-5 * time.Second)}, + }, + }, + }, }, wantedMemberCluster: &clusterv1beta1.MemberCluster{ Status: clusterv1beta1.MemberClusterStatus{ @@ -1477,6 +1740,18 @@ func TestSyncInternalMemberClusterStatus(t *testing.T) { Reason: reasonMemberClusterUnknown, ObservedGeneration: mcObservedGeneration, }, + { + Type: propertyProviderConditionType1, + Status: propertyProviderConditionStatus1, + Reason: propertyProviderConditionReason1, + ObservedGeneration: mcObservedGeneration, + }, + { + Type: propertyProviderConditionType2, + Status: propertyProviderConditionStatus2, + Reason: propertyProviderConditionReason2, + ObservedGeneration: mcObservedGeneration, + }, }, ResourceUsage: clusterv1beta1.ResourceUsage{ Capacity: corev1.ResourceList{ @@ -1501,8 +1776,16 @@ func TestSyncInternalMemberClusterStatus(t *testing.T) { LastReceivedHeartbeat: now, }, { - Type: clusterv1beta1.MultiClusterServiceAgent, - LastReceivedHeartbeat: now, + Type: clusterv1beta1.ServiceExportImportAgent, + Conditions: []metav1.Condition{ + { + Type: string(clusterv1beta1.AgentJoined), + Status: metav1.ConditionTrue, + Reason: "Joined", + ObservedGeneration: mcObservedGeneration - 1, // stale + }, + }, + LastReceivedHeartbeat: metav1.Time{Time: now.Time.Add(-5 * time.Second)}, }, }, }, diff --git a/test/e2e/utils_test.go b/test/e2e/utils_test.go index b847ceb91..62f247a4b 100644 --- a/test/e2e/utils_test.go +++ b/test/e2e/utils_test.go @@ -129,7 +129,7 @@ func markMemberClusterAsHealthy(name string) { { Type: string(clusterv1beta1.AgentJoined), LastTransitionTime: metav1.Now(), - ObservedGeneration: 0, + ObservedGeneration: imcObj.GetGeneration(), Status: metav1.ConditionTrue, Reason: "JoinedCluster", Message: "set to be joined", @@ -137,7 +137,7 @@ func markMemberClusterAsHealthy(name string) { { Type: string(clusterv1beta1.AgentHealthy), LastTransitionTime: metav1.Now(), - ObservedGeneration: 0, + ObservedGeneration: imcObj.GetGeneration(), Status: metav1.ConditionTrue, Reason: "HealthyCluster", Message: "set to be healthy",