CI: Use deployment instead of Pod for agnhost

This is a followup to 2ba28a338 (Revert "Wait for available API token in
a new namespace (#7045)", 2024-10-25).

While checking for the serviceaccount token is not effective, there is
still a race when creating a Pod directly, because the ServiceAccount
itself might not be created yet.
More details at https://github.com/kubernetes/kubernetes/issues/66689.

This causes very frequent flakes in our CI with spurious failures.

Use a Deployment instead; it takes care of creating the Pods and
retrying; it also lets us use kubectl rollout status instead of manually
checking for the pods.
This commit is contained in:
Max Gautier
2024-12-12 14:18:04 +01:00
parent 74aee12305
commit 930df78d8a

View File

@@ -79,53 +79,47 @@
command: command:
cmd: "{{ bin_dir }}/kubectl apply -f -" cmd: "{{ bin_dir }}/kubectl apply -f -"
stdin: | stdin: |
apiVersion: v1 apiVersion: apps/v1
kind: Pod kind: Deployment
metadata: metadata:
name: {{ item }} name: agnhost
namespace: test
spec: spec:
containers: replicas: 2
- name: agnhost selector:
image: {{ test_image_repo }}:{{ test_image_tag }} matchLabels:
command: ['/agnhost', 'netexec', '--http-port=8080'] app: agnhost
securityContext: template:
allowPrivilegeEscalation: false metadata:
capabilities: labels:
drop: ['ALL'] app: agnhost
runAsUser: 1000 spec:
runAsNonRoot: true containers:
seccompProfile: - name: agnhost
type: RuntimeDefault image: {{ test_image_repo }}:{{ test_image_tag }}
command: ['/agnhost', 'netexec', '--http-port=8080']
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop: ['ALL']
runAsUser: 1000
runAsNonRoot: true
seccompProfile:
type: RuntimeDefault
changed_when: false changed_when: false
loop:
- agnhost1
- agnhost2
- import_role: # noqa name[missing] - import_role: # noqa name[missing]
name: cluster-dump name: cluster-dump
- name: Check that all pods are running and ready - name: Check that all pods are running and ready
command: "{{ bin_dir }}/kubectl get pods --namespace test --no-headers -o yaml" block:
changed_when: false - name: Check Deployment is ready
register: run_pods_log command: "{{ bin_dir }}/kubectl rollout status deploy --namespace test agnhost --timeout=180s"
until: changed_when: false
# Check that all pods are running rescue:
- '(run_pods_log.stdout | from_yaml)["items"] | map(attribute = "status.phase") | unique | list == ["Running"]' - name: Get pod names
# Check that all pods are ready command: "{{ bin_dir }}/kubectl get pods -n test -o json"
- '(run_pods_log.stdout | from_yaml)["items"] | map(attribute = "status.containerStatuses") | map("map", attribute = "ready") | map("min") | min' changed_when: false
retries: 18 register: pods
delay: 10
failed_when: false
- name: Get pod names
command: "{{ bin_dir }}/kubectl get pods -n test -o json"
changed_when: false
register: pods
- debug: # noqa name[missing]
msg: "{{ pods.stdout.split('\n') }}"
failed_when: not run_pods_log is success
- name: Get hostnet pods - name: Get hostnet pods
command: "{{ bin_dir }}/kubectl get pods -n test -o command: "{{ bin_dir }}/kubectl get pods -n test -o