diff --git a/docs/cgroups.md b/docs/cgroups.md new file mode 100644 index 000000000..30ca7778e --- /dev/null +++ b/docs/cgroups.md @@ -0,0 +1,72 @@ +# cgroups + +To avoid resource contention between containers, and to limit their impact on the host, the kubelet relies on cgroups to restrict each container’s resource usage. + +## Enforcing Node Allocatable + +You can use `kubelet_enforce_node_allocatable` to set node allocatable enforcement. + +```yaml +# A comma separated list of levels of node allocatable enforcement to be enforced by kubelet. +kubelet_enforce_node_allocatable: "pods" +# kubelet_enforce_node_allocatable: "pods,kube-reserved" +# kubelet_enforce_node_allocatable: "pods,kube-reserved,system-reserved" +``` + +Note that to enforce kube-reserved or system-reserved, `kube_reserved_cgroups` or `system_reserved_cgroups` needs to be specified respectively. + +Here is an example: + +```yaml +kubelet_enforce_node_allocatable: "pods,kube-reserved,system-reserved" + +# Reserve this space for kube resources +# Set to true to reserve resources for kube daemons +kube_reserved: true +kube_reserved_cgroups_for_service_slice: kube.slice +kube_reserved_cgroups: "/{{ kube_reserved_cgroups_for_service_slice }}" +kube_memory_reserved: 256Mi +kube_cpu_reserved: 100m +# kube_ephemeral_storage_reserved: 2Gi +# kube_pid_reserved: "1000" +# Reservation for master hosts +kube_master_memory_reserved: 512Mi +kube_master_cpu_reserved: 200m +# kube_master_ephemeral_storage_reserved: 2Gi +# kube_master_pid_reserved: "1000" + +# Set to true to reserve resources for system daemons +system_reserved: true +system_reserved_cgroups_for_service_slice: system.slice +system_reserved_cgroups: "/{{ system_reserved_cgroups_for_service_slice }}" +system_memory_reserved: 512Mi +system_cpu_reserved: 500m +# system_ephemeral_storage_reserved: 2Gi +# system_pid_reserved: "1000" +# Reservation for master hosts +system_master_memory_reserved: 256Mi +system_master_cpu_reserved: 250m 
+# system_master_ephemeral_storage_reserved: 2Gi +# system_master_pid_reserved: "1000" +``` + +After the setup, the cgroups hierarchy is as follows: + +```bash +/ (Cgroups Root) +├── kubepods.slice +│ ├── ... +│ ├── kubepods-besteffort.slice +│ ├── kubepods-burstable.slice +│ └── ... +├── kube.slice +│ ├── ... +│ ├── {{container_manager}}.service +│ ├── kubelet.service +│ └── ... +├── system.slice +│ └── ... +└── ... +``` + +You can learn more in the [official Kubernetes documentation](https://kubernetes.io/docs/tasks/administer-cluster/reserve-compute-resources/). diff --git a/inventory/sample/group_vars/k8s_cluster/k8s-cluster.yml b/inventory/sample/group_vars/k8s_cluster/k8s-cluster.yml index b4c1de7dc..189157d59 100644 --- a/inventory/sample/group_vars/k8s_cluster/k8s-cluster.yml +++ b/inventory/sample/group_vars/k8s_cluster/k8s-cluster.yml @@ -261,9 +261,36 @@ podsecuritypolicy_enabled: false # Acceptable options are 'pods', 'system-reserved', 'kube-reserved' and ''. Default is "". # kubelet_enforce_node_allocatable: pods +## Set runtime and kubelet cgroups when using systemd as cgroup driver (default) +# kubelet_runtime_cgroups: "{{ kube_reserved_cgroups }}/{{ container_manager }}.service" +# kubelet_kubelet_cgroups: "{{ kube_reserved_cgroups }}/kubelet.service" + +## Set runtime and kubelet cgroups when using cgroupfs as cgroup driver +# kubelet_runtime_cgroups_cgroupfs: "/system.slice/{{ container_manager }}.service" +# kubelet_kubelet_cgroups_cgroupfs: "/system.slice/kubelet.service" + +## Optionally reserve resources for kube daemons. 
+# kube_reserved: true +## Uncomment to override default values +## The following two items need to be set when kube_reserved is true +# kube_reserved_cgroups_for_service_slice: kube.slice +# kube_reserved_cgroups: "/{{ kube_reserved_cgroups_for_service_slice }}" +# kube_memory_reserved: 256Mi +# kube_cpu_reserved: 100m +# kube_ephemeral_storage_reserved: 2Gi +# kube_pid_reserved: "1000" +# Reservation for master hosts +# kube_master_memory_reserved: 512Mi +# kube_master_cpu_reserved: 200m +# kube_master_ephemeral_storage_reserved: 2Gi +# kube_master_pid_reserved: "1000" + ## Optionally reserve resources for OS system daemons. # system_reserved: true ## Uncomment to override default values +## The following two items need to be set when system_reserved is true +# system_reserved_cgroups_for_service_slice: system.slice +# system_reserved_cgroups: "/{{ system_reserved_cgroups_for_service_slice }}" # system_memory_reserved: 512Mi # system_cpu_reserved: 500m # system_ephemeral_storage_reserved: 2Gi diff --git a/roles/container-engine/containerd/templates/containerd.service.j2 b/roles/container-engine/containerd/templates/containerd.service.j2 index adebcf218..06b229084 100644 --- a/roles/container-engine/containerd/templates/containerd.service.j2 +++ b/roles/container-engine/containerd/templates/containerd.service.j2 @@ -36,6 +36,10 @@ LimitMEMLOCK={{ containerd_limit_mem_lock }} # Only systemd 226 and above support this version. 
TasksMax=infinity OOMScoreAdjust=-999 +# Set the cgroup slice of the service so that kube reserved takes effect +{% if kube_reserved is defined and kube_reserved|bool %} +Slice={{ kube_reserved_cgroups_for_service_slice }} +{% endif %} [Install] WantedBy=multi-user.target diff --git a/roles/container-engine/cri-dockerd/templates/cri-dockerd.service.j2 b/roles/container-engine/cri-dockerd/templates/cri-dockerd.service.j2 index 078f66651..ec128150f 100644 --- a/roles/container-engine/cri-dockerd/templates/cri-dockerd.service.j2 +++ b/roles/container-engine/cri-dockerd/templates/cri-dockerd.service.j2 @@ -35,6 +35,10 @@ LimitCORE=infinity TasksMax=infinity Delegate=yes KillMode=process +# Set the cgroup slice of the service so that kube reserved takes effect +{% if kube_reserved is defined and kube_reserved|bool %} +Slice={{ kube_reserved_cgroups_for_service_slice }} +{% endif %} [Install] WantedBy=multi-user.target diff --git a/roles/container-engine/cri-o/templates/crio.conf.j2 b/roles/container-engine/cri-o/templates/crio.conf.j2 index 1a25e0929..d209b2bef 100644 --- a/roles/container-engine/cri-o/templates/crio.conf.j2 +++ b/roles/container-engine/cri-o/templates/crio.conf.j2 @@ -113,8 +113,12 @@ conmon = "{{ crio_conmon }}" {% if crio_cgroup_manager == "cgroupfs" %} conmon_cgroup = "pod" {% else %} +{% if kube_reserved is defined and kube_reserved|bool %} +conmon_cgroup = "{{ kube_reserved_cgroups_for_service_slice }}" +{% else %} conmon_cgroup = "system.slice" {% endif %} +{% endif %} # Environment variable list for the conmon process, used for passing necessary # environment variables to conmon or the runtime. 
diff --git a/roles/container-engine/docker/templates/docker.service.j2 b/roles/container-engine/docker/templates/docker.service.j2 index fd1d06121..539c3a5c4 100644 --- a/roles/container-engine/docker/templates/docker.service.j2 +++ b/roles/container-engine/docker/templates/docker.service.j2 @@ -42,6 +42,10 @@ TimeoutStartSec=1min Restart=on-failure StartLimitBurst=3 StartLimitInterval=60s +# Set the cgroup slice of the service so that kube reserved takes effect +{% if kube_reserved is defined and kube_reserved|bool %} +Slice={{ kube_reserved_cgroups_for_service_slice }} +{% endif %} [Install] WantedBy=multi-user.target diff --git a/roles/kubernetes/node/defaults/main.yml b/roles/kubernetes/node/defaults/main.yml index 8be61744f..0c6b57b8b 100644 --- a/roles/kubernetes/node/defaults/main.yml +++ b/roles/kubernetes/node/defaults/main.yml @@ -12,11 +12,11 @@ kube_resolv_conf: "/etc/resolv.conf" kubelet_enforce_node_allocatable: "\"\"" # Set runtime and kubelet cgroups when using systemd as cgroup driver (default) -kubelet_runtime_cgroups: "/systemd/system.slice" -kubelet_kubelet_cgroups: "/systemd/system.slice" +kubelet_runtime_cgroups: "{{ kube_reserved_cgroups }}/{{ container_manager }}.service" +kubelet_kubelet_cgroups: "{{ kube_reserved_cgroups }}/kubelet.service" # Set runtime and kubelet cgroups when using cgroupfs as cgroup driver -kubelet_runtime_cgroups_cgroupfs: "/system.slice/containerd.service" +kubelet_runtime_cgroups_cgroupfs: "/system.slice/{{ container_manager }}.service" kubelet_kubelet_cgroups_cgroupfs: "/system.slice/kubelet.service" ### fail with swap on (default true) @@ -32,6 +32,10 @@ kubelet_secure_addresses: >- {%- endfor -%} # Reserve this space for kube resources +# Set to true to reserve resources for kube daemons +kube_reserved: false +kube_reserved_cgroups_for_service_slice: kube.slice +kube_reserved_cgroups: "/{{ kube_reserved_cgroups_for_service_slice }}" kube_memory_reserved: 256Mi kube_cpu_reserved: 100m # 
kube_ephemeral_storage_reserved: 2Gi @@ -44,6 +48,8 @@ kube_master_cpu_reserved: 200m # Set to true to reserve resources for system daemons system_reserved: false +system_reserved_cgroups_for_service_slice: system.slice +system_reserved_cgroups: "/{{ system_reserved_cgroups_for_service_slice }}" system_memory_reserved: 512Mi system_cpu_reserved: 500m # system_ephemeral_storage_reserved: 2Gi diff --git a/roles/kubernetes/node/templates/kubelet-config.v1beta1.yaml.j2 b/roles/kubernetes/node/templates/kubelet-config.v1beta1.yaml.j2 index 9982f62aa..885fc2ed7 100644 --- a/roles/kubernetes/node/templates/kubelet-config.v1beta1.yaml.j2 +++ b/roles/kubernetes/node/templates/kubelet-config.v1beta1.yaml.j2 @@ -60,6 +60,8 @@ clusterDNS: - {{ dns_address }} {% endfor %} {# Node reserved CPU/memory #} +{% if kube_reserved|bool %} +kubeReservedCgroup: {{ kube_reserved_cgroups }} kubeReserved: {% if is_kube_master|bool %} cpu: {{ kube_master_cpu_reserved }} @@ -80,7 +82,9 @@ kubeReserved: pid: "{{ kube_pid_reserved }}" {% endif %} {% endif %} -{% if system_reserved is defined and system_reserved %} +{% endif %} +{% if system_reserved|bool %} +systemReservedCgroup: {{ system_reserved_cgroups }} systemReserved: {% if is_kube_master|bool %} cpu: {{ system_master_cpu_reserved }} diff --git a/roles/kubernetes/node/templates/kubelet.service.j2 b/roles/kubernetes/node/templates/kubelet.service.j2 index feb837424..9df98e09e 100644 --- a/roles/kubernetes/node/templates/kubelet.service.j2 +++ b/roles/kubernetes/node/templates/kubelet.service.j2 @@ -10,6 +10,24 @@ Wants={{ container_manager }}.service [Service] EnvironmentFile=-{{ kube_config_dir }}/kubelet.env +{% if system_reserved|bool %} +ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/cpu/{{ system_reserved_cgroups_for_service_slice }} +ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/cpuacct/{{ system_reserved_cgroups_for_service_slice }} +ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/cpuset/{{ system_reserved_cgroups_for_service_slice }} 
+ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/hugetlb/{{ system_reserved_cgroups_for_service_slice }} +ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/memory/{{ system_reserved_cgroups_for_service_slice }} +ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/pids/{{ system_reserved_cgroups_for_service_slice }} +ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/systemd/{{ system_reserved_cgroups_for_service_slice }} +{% endif %} +{% if kube_reserved|bool %} +ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/cpu/{{ kube_reserved_cgroups_for_service_slice }} +ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/cpuacct/{{ kube_reserved_cgroups_for_service_slice }} +ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/cpuset/{{ kube_reserved_cgroups_for_service_slice }} +ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/hugetlb/{{ kube_reserved_cgroups_for_service_slice }} +ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/memory/{{ kube_reserved_cgroups_for_service_slice }} +ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/pids/{{ kube_reserved_cgroups_for_service_slice }} +ExecStartPre=/bin/mkdir -p /sys/fs/cgroup/systemd/{{ kube_reserved_cgroups_for_service_slice }} +{% endif %} ExecStart={{ bin_dir }}/kubelet \ $KUBE_LOGTOSTDERR \ $KUBE_LOG_LEVEL \