Skip to content

Commit 74bac7e

Browse files
committed
MGMT-23738: Add RetryOnConflict to CreateAgentCR to fix flaky agent move between infraenvs
1 parent babc11d commit 74bac7e

2 files changed

Lines changed: 91 additions & 12 deletions

File tree

internal/controller/controllers/crd_utils.go

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313
k8serrors "k8s.io/apimachinery/pkg/api/errors"
1414
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1515
"k8s.io/apimachinery/pkg/types"
16+
"k8s.io/client-go/util/retry"
1617
"sigs.k8s.io/controller-runtime/pkg/client"
1718
)
1819

@@ -137,6 +138,26 @@ func (u *CRDUtils) CreateAgentCR(ctx context.Context, log logrus.FieldLogger, ho
137138
}
138139
}
139140

141+
if err := u.updateAgentCR(ctx, namespacedName, labels, infraEnvCR, cluster); err != nil {
142+
return err
143+
}
144+
// Update 'kube_key_namespace' in Host
145+
if err := u.hostApi.UpdateKubeKeyNS(ctx, hostId, infraEnvCR.Namespace); err != nil {
146+
return errors.Wrapf(err, "Failed to update 'kube_key_namespace' in host %s", hostId)
147+
}
148+
log.Infof("Updated Agent CR. Namespace: %s, Cluster: %s, HostID: %s", infraEnvCR.Namespace, clusterName, hostId)
149+
}
150+
151+
return nil
152+
}
153+
154+
func (u *CRDUtils) updateAgentCR(ctx context.Context, namespacedName types.NamespacedName, labels map[string]string, infraEnvCR *aiv1beta1.InfraEnv, cluster *common.Cluster) error {
155+
return retry.RetryOnConflict(retry.DefaultBackoff, func() error {
156+
current := &aiv1beta1.Agent{}
157+
if err := u.client.Get(ctx, namespacedName, current); err != nil {
158+
return err
159+
}
160+
140161
updatedhost := &aiv1beta1.Agent{
141162
Spec: aiv1beta1.AgentSpec{
142163
Approved: false,
@@ -148,10 +169,10 @@ func (u *CRDUtils) CreateAgentCR(ctx context.Context, log logrus.FieldLogger, ho
148169
IgnitionConfigOverrides: "",
149170
},
150171
ObjectMeta: metav1.ObjectMeta{
151-
Name: hostId,
152-
Namespace: infraEnvCR.Namespace,
172+
Name: namespacedName.Name,
173+
Namespace: namespacedName.Namespace,
153174
Labels: labels,
154-
ResourceVersion: host.ResourceVersion,
175+
ResourceVersion: current.ResourceVersion,
155176
OwnerReferences: []metav1.OwnerReference{
156177
{
157178
APIVersion: infraEnvCR.APIVersion,
@@ -162,22 +183,14 @@ func (u *CRDUtils) CreateAgentCR(ctx context.Context, log logrus.FieldLogger, ho
162183
},
163184
},
164185
}
165-
166-
// Update 'kube_key_namespace' in Host
167-
if err1 := u.hostApi.UpdateKubeKeyNS(ctx, hostId, infraEnvCR.Namespace); err1 != nil {
168-
return errors.Wrapf(err, "Failed to update 'kube_key_namespace' in host %s", hostId)
169-
}
170186
if cluster != nil && cluster.KubeKeyNamespace != "" {
171187
updatedhost.Spec.ClusterDeploymentName = &aiv1beta1.ClusterReference{
172188
Name: cluster.KubeKeyName,
173189
Namespace: cluster.KubeKeyNamespace,
174190
}
175191
}
176-
log.Infof("Updating Agent CR. Namespace: %s, Cluster: %s, HostID: %s", infraEnvCR.Namespace, clusterName, hostId)
177192
return u.client.Update(ctx, updatedhost)
178-
}
179-
180-
return nil
193+
})
181194
}
182195

183196
type DummyCRDUtils struct{}

internal/controller/controllers/crd_utils_test.go

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package controllers
22

33
import (
44
"context"
5+
"fmt"
56

67
strfmt "github.com/go-openapi/strfmt"
78
"github.com/google/uuid"
@@ -13,10 +14,13 @@ import (
1314
"github.com/openshift/assisted-service/models"
1415
hivev1 "github.com/openshift/hive/apis/hive/v1"
1516
"go.uber.org/mock/gomock"
17+
k8serrors "k8s.io/apimachinery/pkg/api/errors"
18+
"k8s.io/apimachinery/pkg/runtime/schema"
1619
"k8s.io/apimachinery/pkg/types"
1720
"k8s.io/client-go/kubernetes/scheme"
1821
"sigs.k8s.io/controller-runtime/pkg/client"
1922
fakeclient "sigs.k8s.io/controller-runtime/pkg/client/fake"
23+
"sigs.k8s.io/controller-runtime/pkg/client/interceptor"
2024
)
2125

2226
var _ = Describe("create agent CR", func() {
@@ -176,6 +180,68 @@ var _ = Describe("create agent CR", func() {
176180
Expect(err).NotTo(HaveOccurred())
177181
})
178182

183+
It("Already existing agent update retries on conflict", func() {
184+
clusterDeployment := newClusterDeployment(clusterName, testNamespace, defaultClusterSpec)
185+
Expect(c.Create(ctx, clusterDeployment)).ShouldNot(HaveOccurred())
186+
infraEnvImage := newInfraEnvImage(infraEnvName, infraEnvNamespace, v1beta1.InfraEnvSpec{
187+
ClusterRef: &v1beta1.ClusterReference{
188+
Name: clusterDeployment.Name,
189+
Namespace: clusterDeployment.Namespace,
190+
},
191+
})
192+
Expect(c.Create(ctx, infraEnvImage)).ShouldNot(HaveOccurred())
193+
194+
hostId := uuid.New().String()
195+
agent := newAgent(hostId, infraEnvNamespace, v1beta1.AgentSpec{})
196+
Expect(c.Create(ctx, agent)).ShouldNot(HaveOccurred())
197+
198+
id := strfmt.UUID(hostId)
199+
otherClusterId := strfmt.UUID(uuid.New().String())
200+
infraEnvId2 := strfmt.UUID(uuid.New().String())
201+
h := common.Host{
202+
Host: models.Host{
203+
ID: &id,
204+
ClusterID: &otherClusterId,
205+
InfraEnvID: infraEnvId,
206+
},
207+
}
208+
infraEnv2 := &common.InfraEnv{
209+
KubeKeyNamespace: infraEnvNamespace,
210+
InfraEnv: models.InfraEnv{
211+
ID: &infraEnvId2,
212+
Name: &infraEnvName,
213+
},
214+
}
215+
216+
mockHostApi.EXPECT().GetHostByKubeKey(gomock.Any()).Return(&h, nil).Times(1)
217+
mockHostApi.EXPECT().UnRegisterHost(ctx, gomock.Any()).Return(nil).Times(1)
218+
mockHostApi.EXPECT().UpdateKubeKeyNS(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil).Times(1)
219+
220+
updateCount := 0
221+
conflictClient := fakeclient.NewClientBuilder().
222+
WithScheme(scheme.Scheme).
223+
WithObjects(clusterDeployment, infraEnvImage, agent).
224+
WithInterceptorFuncs(interceptor.Funcs{
225+
Update: func(ctx context.Context, cl client.WithWatch, obj client.Object, opts ...client.UpdateOption) error {
226+
updateCount++
227+
if updateCount == 1 {
228+
return k8serrors.NewConflict(
229+
schema.GroupResource{Group: "agent-install.openshift.io", Resource: "agents"},
230+
obj.GetName(),
231+
fmt.Errorf("the object has been modified"),
232+
)
233+
}
234+
return cl.Update(ctx, obj, opts...)
235+
},
236+
}).
237+
Build()
238+
239+
conflictCrdUtils := NewCRDUtils(conflictClient, mockHostApi)
240+
err := conflictCrdUtils.CreateAgentCR(ctx, log, hostId, infraEnv2, cluster)
241+
Expect(err).NotTo(HaveOccurred())
242+
Expect(updateCount).To(Equal(2))
243+
})
244+
179245
It("Already existing agent different infraenv same namespace", func() {
180246
clusterDeployment := newClusterDeployment(clusterName, testNamespace, defaultClusterSpec)
181247
Expect(c.Create(ctx, clusterDeployment)).ShouldNot(HaveOccurred())

0 commit comments

Comments
 (0)