
Merge pull request #11530 from VannTen/ci/cleanup_with_k8s_gc

[CI] Use Kubernetes GC to clean kubevirt VMs (packet-* jobs)
Kubernetes Prow Robot committed 1 week ago (via GitHub)
commit 05e2b47db6
18 changed files with 152 additions and 352 deletions
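The gist of the change: instead of a periodic cleanup job that deletes stale CI namespaces, every kubevirt VirtualMachineInstance now carries an ownerReference pointing at the CI runner pod that created it, so the Kubernetes garbage collector cascades the pod's deletion to the VMIs. A minimal sketch of the mechanism, with a hypothetical pod name and UID (the real values come from the POD_NAME/POD_UID environment variables; see vars/main.yml below):

# Sketch only: a VMI owned by the CI runner pod. When pod "runner-abc123"
# is deleted, the garbage collector deletes this VMI as well.
apiVersion: kubevirt.io/v1
kind: VirtualMachineInstance
metadata:
  generateName: test-vm-
  namespace: ci-pods                              # hypothetical namespace
  ownerReferences:
    - apiVersion: v1
      kind: Pod
      name: runner-abc123                         # hypothetical POD_NAME
      uid: 3b1f8c2a-0000-0000-0000-000000000000   # hypothetical POD_UID
spec: {}  # domain/devices/networks elided; see vm.yml.j2 below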
  1. .gitlab-ci.yml (7 changes)
  2. .gitlab-ci/packet.yml (8 changes)
  3. tests/Makefile (23 changes)
  4. tests/cloud_playbooks/cleanup-packet.yml (8 changes)
  5. tests/cloud_playbooks/create-packet.yml (3 changes)
  6. tests/cloud_playbooks/delete-packet.yml (11 changes)
  7. tests/cloud_playbooks/roles/cleanup-packet-ci/tasks/main.yml (16 changes)
  8. tests/cloud_playbooks/roles/packet-ci/tasks/cleanup-old-vms.yml (17 changes)
  9. tests/cloud_playbooks/roles/packet-ci/tasks/create-vms.yml (50 changes)
  10. tests/cloud_playbooks/roles/packet-ci/tasks/delete-vms.yml (30 changes)
  11. tests/cloud_playbooks/roles/packet-ci/tasks/main.yml (59 changes)
  12. tests/cloud_playbooks/roles/packet-ci/templates/inventory.j2 (98 changes)
  13. tests/cloud_playbooks/roles/packet-ci/templates/vm.yml.j2 (97 changes)
  14. tests/cloud_playbooks/roles/packet-ci/vars/main.yml (44 changes)
  15. tests/common/_kubespray_test_settings.yml (5 changes)
  16. tests/common_vars.yml (10 changes)
  17. tests/scripts/testcases_cleanup.sh (2 changes)
  18. tests/scripts/testcases_run.sh (16 changes)

.gitlab-ci.yml (7 changes)

@@ -11,10 +11,6 @@ variables:
GITLAB_REPOSITORY: 'kargo-ci/kubernetes-sigs-kubespray'
ANSIBLE_FORCE_COLOR: "true"
MAGIC: "ci check this"
TEST_ID: "$CI_PIPELINE_ID-$CI_JOB_ID"
CI_TEST_VARS: "./tests/files/${CI_JOB_NAME}.yml"
CI_TEST_REGISTRY_MIRROR: "./tests/common/_docker_hub_registry_mirror.yml"
CI_TEST_SETTING: "./tests/common/_kubespray_test_settings.yml"
GS_ACCESS_KEY_ID: $GS_KEY
GS_SECRET_ACCESS_KEY: $GS_SECRET
CONTAINER_ENGINE: docker
@@ -22,13 +18,12 @@ variables:
GCE_PREEMPTIBLE: "false"
ANSIBLE_KEEP_REMOTE_FILES: "1"
ANSIBLE_CONFIG: ./tests/ansible.cfg
ANSIBLE_INVENTORY: ./inventory/sample/${CI_JOB_NAME}-${BUILD_NUMBER}.ini
IDEMPOT_CHECK: "false"
RESET_CHECK: "false"
REMOVE_NODE_CHECK: "false"
UPGRADE_TEST: "false"
MITOGEN_ENABLE: "false"
ANSIBLE_LOG_LEVEL: "-vv"
ANSIBLE_VERBOSITY: 2
RECOVER_CONTROL_PLANE_TEST: "false"
RECOVER_CONTROL_PLANE_TEST_GROUPS: "etcd[2:]:kube_control_plane[1:]"
TERRAFORM_VERSION: 1.3.7

.gitlab-ci/packet.yml (8 changes)

@@ -65,14 +65,6 @@
allow_failure: true
extends: .packet
packet_cleanup_old:
stage: deploy-part1
extends: .packet_periodic
script:
- cd tests
- make cleanup-packet
after_script: []
# The ubuntu20-calico-all-in-one jobs are meant as early stages to prevent running the full CI if something is horribly broken
packet_ubuntu20-calico-all-in-one:
stage: deploy-part1

tests/Makefile (23 changes)

@@ -1,5 +1,3 @@
INVENTORY=$(PWD)/../inventory/sample/${CI_JOB_NAME}-${BUILD_NUMBER}.ini
init-packet:
mkdir -p $(HOME)/.ssh
echo $(PACKET_VM_SSH_PRIVATE_KEY) | base64 -d > $(HOME)/.ssh/id_rsa
@@ -13,30 +11,17 @@ delete-tf:
create-packet: init-packet
ansible-playbook cloud_playbooks/create-packet.yml -c local \
$(ANSIBLE_LOG_LEVEL) \
-e @"files/${CI_JOB_NAME}.yml" \
-e test_id=$(TEST_ID) \
-e branch="$(CI_COMMIT_BRANCH)" \
-e pipeline_id="$(CI_PIPELINE_ID)" \
-e inventory_path=$(INVENTORY)
delete-packet:
ansible-playbook cloud_playbooks/delete-packet.yml -c local \
$(ANSIBLE_LOG_LEVEL) \
-e @"files/${CI_JOB_NAME}.yml" \
-e test_id=$(TEST_ID) \
-e test_name="$(subst .,-,$(CI_PIPELINE_ID)-$(CI_JOB_ID))" \
-e branch="$(CI_COMMIT_BRANCH)" \
-e pipeline_id="$(CI_PIPELINE_ID)" \
-e inventory_path=$(INVENTORY)
-e inventory_path=$(INVENTORY_DIR)
cleanup-packet:
ansible-playbook cloud_playbooks/cleanup-packet.yml -c local \
$(ANSIBLE_LOG_LEVEL)
delete-packet: ;
create-vagrant:
vagrant up
cp $(CI_PROJECT_DIR)/.vagrant/provisioners/ansible/inventory/vagrant_ansible_inventory $(INVENTORY)
cp $(CI_PROJECT_DIR)/.vagrant/provisioners/ansible/inventory/vagrant_ansible_inventory $(INVENTORY_DIR)
delete-vagrant:
vagrant destroy -f

tests/cloud_playbooks/cleanup-packet.yml (8 changes)

@@ -1,8 +0,0 @@
---
- name: Cleanup packet vms
hosts: localhost
gather_facts: false
become: true
roles:
- { role: cleanup-packet-ci }

tests/cloud_playbooks/create-packet.yml (3 changes)

@@ -4,8 +4,5 @@
hosts: localhost
gather_facts: false
become: true
vars:
ci_job_name: "{{ lookup('env', 'CI_JOB_NAME') }}"
test_name: "{{ test_id | regex_replace('\\.', '-') }}"
roles:
- { role: packet-ci, vm_cleanup: false }

tests/cloud_playbooks/delete-packet.yml (11 changes)

@@ -1,11 +0,0 @@
---
- name: Terminate Packet VMs
hosts: localhost
gather_facts: false
become: true
vars:
ci_job_name: "{{ lookup('env', 'CI_JOB_NAME') }}"
test_name: "{{ test_id | regex_replace('\\.', '-') }}"
roles:
- { role: packet-ci, vm_cleanup: true }

tests/cloud_playbooks/roles/cleanup-packet-ci/tasks/main.yml (16 changes)

@@ -1,16 +0,0 @@
---
- name: Fetch a list of namespaces
kubernetes.core.k8s_info:
api_version: v1
kind: Namespace
label_selectors:
- cijobs = true
register: namespaces
- name: Delete namespaces stale for more than 2 hours
command: "kubectl delete namespace {{ item.metadata.name }}"
failed_when: false
loop: "{{ namespaces.resources }}"
when:
- (now() - (item.metadata.creationTimestamp | to_datetime("%Y-%m-%dT%H:%M:%SZ"))).total_seconds() >= 7200

tests/cloud_playbooks/roles/packet-ci/tasks/cleanup-old-vms.yml (17 changes)

@@ -1,17 +0,0 @@
---
- name: Fetch a list of namespaces
kubernetes.core.k8s_info:
api_version: v1
kind: Namespace
label_selectors:
- cijobs = true
- branch = {{ branch_name_sane }}
register: namespaces
- name: Delete older namespaces
command: "kubectl delete namespace {{ item.metadata.name }}"
failed_when: false
loop: "{{ namespaces.resources }}"
when:
- (item.metadata.labels.pipeline_id | int) < (pipeline_id | int)

tests/cloud_playbooks/roles/packet-ci/tasks/create-vms.yml (50 changes)

@@ -1,50 +0,0 @@
---
- name: "Create CI namespace {{ test_name }} for test vms"
shell: |-
kubectl create namespace {{ test_name }} &&
kubectl label namespace {{ test_name }} cijobs=true branch="{{ branch_name_sane }}" pipeline_id="{{ pipeline_id }}"
changed_when: false
- name: "Create temp dir /tmp/{{ test_name }} for CI files"
file:
path: "/tmp/{{ test_name }}"
state: directory
mode: "0755"
- name: Template vm files for CI job
set_fact:
vms_files: "{{ vms_files + [lookup('ansible.builtin.template', 'vm.yml.j2') | from_yaml] }}"
vars:
vms_files: []
loop: "{{ range(1, vm_count | int + 1, 1) | list }}"
loop_control:
index_var: vm_id
- name: Start vms for CI job
kubernetes.core.k8s:
definition: "{{ item }}"
changed_when: false
loop: "{{ vms_files }}"
- name: Wait for vms to have ipaddress assigned
shell: "set -o pipefail && kubectl get vmis -n {{ test_name }} instance-{{ vm_id }} -o json | jq '.status.interfaces[].ipAddress' | tr -d '\"'"
args:
executable: /bin/bash
changed_when: false
register: vm_ips
loop: "{{ range(1, vm_count | int + 1, 1) | list }}"
loop_control:
index_var: vm_id
retries: 20
delay: 15
until:
- vm_ips.stdout | ansible.utils.ipaddr
- name: "Create inventory for CI test in file /tmp/{{ test_name }}/inventory"
template:
src: "inventory.j2"
dest: "{{ inventory_path }}"
mode: "0644"
vars:
vms: "{{ vm_ips }}"

tests/cloud_playbooks/roles/packet-ci/tasks/delete-vms.yml (30 changes)

@@ -1,30 +0,0 @@
---
- name: Check if temp directory for {{ test_name }} exists
stat:
path: "/tmp/{{ test_name }}"
get_attributes: false
get_checksum: false
get_mime: false
register: temp_dir_details
- name: "Cleanup temp directory for {{ test_name }}"
file:
path: "/tmp/{{ test_name }}"
state: absent
- name: "Cleanup namespace for {{ test_name }}"
command: "kubectl delete namespace {{ test_name }}"
changed_when: false
- name: Wait for namespace {{ test_name }} to be fully deleted
command: kubectl get ns {{ test_name }}
register: delete_namespace
failed_when:
- delete_namespace.rc == 0
changed_when:
- delete_namespace.rc == 0
retries: 12
delay: 10
until:
- delete_namespace.rc != 0

tests/cloud_playbooks/roles/packet-ci/tasks/main.yml (59 changes)

@@ -1,17 +1,52 @@
---
- name: "Include custom vars for ci job: {{ ci_job_name }}"
- name: Include custom vars for ci job
include_vars: "../files/{{ ci_job_name }}.yml"
- name: Cleanup old VMs
import_tasks: cleanup-old-vms.yml
- name: Start vms for CI job
vars:
# Workaround for compatibility when testing upgrades with old == before e9d406ed088d4291ef1d9018c170a4deed2bf928
# TODO: drop after 2.27.0
legacy_groups: "{{ (['kube_control_plane', 'kube_node', 'calico_rr'] | intersect(item) | length > 0) | ternary(['k8s_cluster'], []) }}"
tvars:
kubespray_groups: "{{ item + legacy_groups }}"
kubernetes.core.k8s:
definition: "{{ lookup('template', 'vm.yml.j2', template_vars=tvars) }}"
loop: "{{ scenarios[mode | d('default')] }}"
- name: Wait for vms to have IP addresses
kubernetes.core.k8s_info:
api_version: kubevirt.io/v1
kind: VirtualMachineInstance
label_selectors:
- "ci_job_id={{ ci_job_id }}"
namespace: "{{ pod_namespace }}"
register: vmis
until: vmis.resources
| map(attribute='status.interfaces.0')
| rejectattr('ipAddress', 'defined') == []
retries: 30
delay: 10
- name: Create VMs
import_tasks: create-vms.yml
when:
- not vm_cleanup
- name: Massage VirtualMachineInstance data into an Ansible inventory structure
vars:
ips: "{{ vmis.resources | map(attribute='status.interfaces.0.ipAddress') }}"
names: "{{ vmis.resources | map(attribute='metadata.name') }}"
_groups: "{{ vmis.resources | map(attribute='metadata.annotations.ansible_groups') | map('split', ',') }}"
hosts: "{{ ips | zip(_groups, names)
| map('zip', ['ansible_host', 'ansible_groups', 'k8s_vmi_name'])
| map('map', 'reverse') | map('community.general.dict') }}"
loop: "{{ hosts | map(attribute='ansible_groups') | flatten | unique }}"
set_fact:
ci_inventory: "{{ ci_inventory|d({}) | combine({
item: {
'hosts': hosts | selectattr('ansible_groups', 'contains', item)
| rekey_on_member('k8s_vmi_name')
}
})
}}"
- name: Delete VMs
import_tasks: delete-vms.yml
when:
- vm_cleanup | default(false)
- name: Create inventory for CI tests
copy:
content: "{{ ci_inventory | to_yaml }}"
dest: "{{ inventory_path }}/ci_inventory.yml"
mode: "0644"
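For orientation, the ci_inventory fact built above is a YAML inventory keyed by group, with each group's hosts rekeyed on k8s_vmi_name. A sketch of what the generated ci_inventory.yml could look like for the "default" scenario (VMI names and IP addresses are invented):

kube_control_plane:
  hosts:
    test-vm-4x7kq:
      ansible_host: 10.244.1.10
      ansible_groups: [kube_control_plane, etcd, k8s_cluster]
      k8s_vmi_name: test-vm-4x7kq
etcd:
  hosts:
    test-vm-4x7kq:
      ansible_host: 10.244.1.10
      ansible_groups: [kube_control_plane, etcd, k8s_cluster]
      k8s_vmi_name: test-vm-4x7kq
kube_node:
  hosts:
    test-vm-9zp2m:
      ansible_host: 10.244.2.11
      ansible_groups: [kube_node, k8s_cluster]
      k8s_vmi_name: test-vm-9zp2m
# a k8s_cluster group containing both hosts is generated the same way (omitted here)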

tests/cloud_playbooks/roles/packet-ci/templates/inventory.j2 (98 changes)

@@ -1,98 +0,0 @@
[all]
{% for instance in vms.results %}
instance-{{ loop.index }} ansible_host={{instance.stdout}}
{% endfor %}
{% if mode == "separate" %}
[kube_control_plane]
instance-1
[kube_node]
instance-2
[etcd]
instance-3
{% elif mode == "ha" %}
[kube_control_plane]
instance-1
instance-2
[kube_node]
instance-3
[etcd]
instance-1
instance-2
instance-3
{% elif mode == "default" %}
[kube_control_plane]
instance-1
[kube_node]
instance-2
[etcd]
instance-1
{% elif mode == "all-in-one" %}
[kube_control_plane]
instance-1
[kube_node]
instance-1
[etcd]
instance-1
{% elif mode == "ha-recover" %}
[kube_control_plane]
instance-1
instance-2
[kube_node]
instance-3
[etcd]
instance-3
instance-1
instance-2
[broken_kube_control_plane]
instance-2
[broken_etcd]
instance-2 etcd_member_name=etcd3
{% elif mode == "ha-recover-noquorum" %}
[kube_control_plane]
instance-3
instance-1
instance-2
[kube_node]
instance-3
[etcd]
instance-3
instance-1
instance-2
[broken_kube_control_plane]
instance-1
instance-2
[broken_etcd]
instance-1 etcd_member_name=etcd2
instance-2 etcd_member_name=etcd3
{% elif mode == "node-etcd-client" %}
[kube_control_plane]
instance-1
[etcd]
instance-1
instance-2
instance-3
[kube_node]
instance-1
instance-2
instance-3
instance-4
{% endif %}

tests/cloud_playbooks/roles/packet-ci/templates/vm.yml.j2 (97 changes)

@@ -1,54 +1,59 @@
---
apiVersion: kubevirt.io/v1alpha3
kind: VirtualMachine
apiVersion: kubevirt.io/v1
kind: VirtualMachineInstance
metadata:
name: "instance-{{ vm_id }}"
namespace: "{{ test_name }}"
generateName: test-vm-
namespace: {{ pod_namespace }}
annotations:
kubespray.com/ci.template-path: "tests/cloud_playbooks/roles/packet-ci/templates/vm.yml.j2"
ansible_groups: "{{ kubespray_groups | join(',') }}"
# This does not use a DNS prefix because dots are hard to escape with map(attribute=) in Jinja
labels:
kubevirt.io/os: {{ cloud_image }}
kubevirt.io/size: small
kubevirt.io/domain: "{{ test_name }}"
ci_job_id: "{{ ci_job_id }}"
ci_job_name: "{{ ci_job_name }}"
# leverage the Kubernetes GC for resources cleanup
ownerReferences:
- apiVersion: v1
kind: Pod
name: "{{ pod_name }}"
uid: "{{ pod_uid }}"
spec:
running: true
template:
metadata:
labels:
kubevirt.io/size: small
kubevirt.io/domain: "{{ test_name }}"
spec:
domain:
devices:
blockMultiQueue: true
disks:
- disk:
bus: virtio
name: containervolume
cache: writethrough
- disk:
bus: virtio
name: cloudinitvolume
interfaces:
- name: default
bridge: {}
cpu:
cores: {{ vm_cpu_cores }}
sockets: {{ vm_cpu_sockets }}
threads: {{ vm_cpu_threads }}
resources:
requests:
memory: "{{ vm_memory * memory_allocation_ratio }}Mi"
cpu: {{ vm_cpu_cores * cpu_allocation_ratio }}
limits:
memory: "{{ vm_memory }}Mi"
cpu: {{ vm_cpu_cores }}
networks:
domain:
devices:
blockMultiQueue: true
disks:
- disk:
bus: virtio
name: containervolume
cache: writethrough
- disk:
bus: virtio
name: cloudinitvolume
interfaces:
- name: default
pod: {}
terminationGracePeriodSeconds: 0
volumes:
- name: containervolume
containerDisk:
image: quay.io/kubespray/vm-{{ cloud_image }}
- name: cloudinitvolume
cloudInitNoCloud:
userDataBase64: {{ cloud_init[cloud_image] }}
bridge: {}
cpu:
cores: {{ vm_cpu_cores }}
sockets: {{ vm_cpu_sockets }}
threads: {{ vm_cpu_threads }}
resources:
requests:
memory: "{{ vm_memory * memory_allocation_ratio }}Mi"
cpu: {{ vm_cpu_cores * cpu_allocation_ratio }}
limits:
memory: "{{ vm_memory }}Mi"
cpu: {{ vm_cpu_cores }}
networks:
- name: default
pod: {}
terminationGracePeriodSeconds: 0
volumes:
- name: containervolume
containerDisk:
image: quay.io/kubespray/vm-{{ cloud_image }}
- name: cloudinitvolume
cloudInitNoCloud:
userDataBase64: {{ cloud_init[cloud_image] }}

tests/cloud_playbooks/roles/packet-ci/vars/main.yml (44 changes)

@@ -1,11 +1,37 @@
---
_vm_count_dict:
separate: 3
ha: 3
ha-recover: 3
ha-recover-noquorum: 3
all-in-one: 1
node-etcd-client: 4
default: 2
# This is a list of nodes with their groups for each scenario (cluster layout)
scenarios:
separate:
- ['kube_control_plane']
- ['kube_node']
- ['etcd']
ha:
- ['kube_control_plane', 'etcd']
- ['kube_control_plane', 'etcd']
- ['kube_node', 'etcd']
default:
- ['kube_control_plane', 'etcd']
- ['kube_node']
all-in-one:
- ['kube_control_plane', 'etcd', 'kube_node']
ha-recover:
- ['kube_control_plane', 'etcd']
- ['kube_control_plane', 'etcd', 'broken_kube_control_plane', 'broken_etcd']
- ['kube_node', 'etcd']
ha-recover-noquorum:
- ['kube_control_plane', 'etcd', 'broken_kube_control_plane', 'broken_etcd']
- ['kube_control_plane', 'etcd', 'broken_kube_control_plane', 'broken_etcd']
- ['kube_node', 'etcd']
node-etcd-client:
- ['kube_node', 'kube_control_plane', 'etcd']
- ['kube_node', 'etcd']
- ['kube_node', 'etcd']
- ['kube_node']
vm_count: "{{ _vm_count_dict[mode | d('default')] }}"
# Get pod metadata / CI vars from environment
ci_job_id: "{{ lookup('ansible.builtin.env', 'CI_JOB_ID', default=undefined) }}"
ci_job_name: "{{ lookup('ansible.builtin.env', 'CI_JOB_NAME', default=undefined) }}"
pod_name: "{{ lookup('ansible.builtin.env', 'POD_NAME', default=undefined) }}"
pod_uid: "{{ lookup('ansible.builtin.env', 'POD_UID', default=undefined) }}"
pod_namespace: "{{ lookup('ansible.builtin.env', 'POD_NAMESPACE', default=undefined) }}"
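Tying vars and template together: the play in tasks/main.yml loops over scenarios[mode], rendering vm.yml.j2 once per node with kubespray_groups set to that node's group list (plus the legacy k8s_cluster group from the compatibility workaround). For the first node of the "ha" scenario, the rendered metadata could look like this (image, job id, name, and namespace are invented):

metadata:
  generateName: test-vm-
  namespace: ci-pods                    # hypothetical POD_NAMESPACE
  annotations:
    kubespray.com/ci.template-path: "tests/cloud_playbooks/roles/packet-ci/templates/vm.yml.j2"
    ansible_groups: "kube_control_plane,etcd,k8s_cluster"
  labels:
    kubevirt.io/os: ubuntu20            # hypothetical cloud_image
    kubevirt.io/size: small
    ci_job_id: "1234567"                # hypothetical CI_JOB_ID
    ci_job_name: "packet_ubuntu20-calico-all-in-one"   # hypothetical CI_JOB_NAME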

tests/common/_kubespray_test_settings.yml (5 changes)

@@ -1,5 +0,0 @@
---
# Kubespray settings for tests
deploy_netchecker: true
dns_min_replicas: 1
unsafe_show_logs: true

tests/common/_docker_hub_registry_mirror.yml → tests/common_vars.yml (10 changes)

@@ -1,4 +1,10 @@
---
# Kubespray settings for tests
deploy_netchecker: true
dns_min_replicas: 1
unsafe_show_logs: true
# Registry mirrors settings
docker_registry_mirrors:
- "https://mirror.gcr.io"
@@ -34,7 +40,3 @@ nginx_image_repo: "{{ quay_image_repo }}/kubespray/nginx"
flannel_image_repo: "{{ quay_image_repo }}/kubespray/flannel"
flannel_init_image_repo: "{{ quay_image_repo }}/kubespray/flannel-cni-plugin"
# Kubespray settings for tests
deploy_netchecker: true
dns_min_replicas: 1

tests/scripts/testcases_cleanup.sh (2 changes)

@@ -1,7 +1,7 @@
#!/bin/bash
set -euxo pipefail
cd tests && make delete-${CI_PLATFORM} -s ; cd -
make -C tests delete-${CI_PLATFORM} -s
if [ -d ~/.ara ] ; then
tar czvf ${CI_PROJECT_DIR}/cluster-dump/ara.tgz ~/.ara

tests/scripts/testcases_run.sh (16 changes)

@@ -18,10 +18,9 @@ fi
# Check out latest tag if testing upgrade
if [ "${UPGRADE_TEST}" != "false" ]; then
git fetch --all && git checkout "$KUBESPRAY_VERSION"
# Checkout the CI vars file so it is available
git checkout "${CI_COMMIT_SHA}" tests/files/${CI_JOB_NAME}.yml
git checkout "${CI_COMMIT_SHA}" ${CI_TEST_REGISTRY_MIRROR}
git checkout "${CI_COMMIT_SHA}" ${CI_TEST_SETTING}
# Check out the current tests/ directory; even when testing an old version,
# we want the up-to-date test setup/provisioning
git checkout "${CI_COMMIT_SHA}" -- tests/
fi
# needed for ara not to complain
@@ -31,8 +30,9 @@ export ANSIBLE_REMOTE_USER=$SSH_USER
export ANSIBLE_BECOME=true
export ANSIBLE_BECOME_USER=root
export ANSIBLE_CALLBACK_PLUGINS="$(python -m ara.setup.callback_plugins)"
export ANSIBLE_INVENTORY=${CI_PROJECT_DIR}/inventory/sample/
cd tests && make create-${CI_PLATFORM} -s ; cd -
make -C tests INVENTORY_DIR=${ANSIBLE_INVENTORY} create-${CI_PLATFORM} -s
ansible-playbook tests/cloud_playbooks/wait-for-ssh.yml
# Flatcar Container Linux needs auto update disabled
@@ -55,10 +55,8 @@ playbook=$1
shift
# We can set --limit here and still pass it as supplemental args because `--limit` is a 'last one wins' option
ansible-playbook \
$ANSIBLE_LOG_LEVEL \
-e @${CI_TEST_SETTING} \
-e @${CI_TEST_REGISTRY_MIRROR} \
-e @${CI_TEST_VARS} \
-e @tests/common_vars.yml \
-e @tests/files/${CI_JOB_NAME}.yml \
-e local_release_dir=${PWD}/downloads \
"$@" \
${playbook}
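Note the ordering of the -e @file arguments: for ansible-playbook extra-vars, the last definition of a variable wins, so the per-job file in tests/files/ can override the shared defaults from tests/common_vars.yml. A hypothetical per-job vars file, for illustration only:

# tests/files/<job-name>.yml (contents invented for illustration)
mode: ha                      # picks the cluster layout from scenarios in vars/main.yml
cloud_image: ubuntu20         # hypothetical image key, consumed by vm.yml.j2
kube_network_plugin: calico   # example Kubespray setting under test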
