Welcome to the fifteenth post in our Kubernetes A-to-Z Series! Now that you understand troubleshooting, let’s explore Quality Assurance - testing strategies and chaos engineering practices that ensure your Kubernetes applications are reliable and resilient.
Testing Pyramid for Kubernetes
┌─────────────────────────────────────────────────┐
│ Testing Pyramid │
│ │
│ /\ │
│ / \ E2E Tests │
│ / \ (Few, Slow, Expensive) │
│ /──────\ │
│ / \ Integration Tests │
│ / \ (Medium) │
│ /────────────\ │
│ / \ Unit Tests │
│ / \ (Many, Fast, Cheap) │
│ /──────────────────\ │
└─────────────────────────────────────────────────┘
Unit Testing Kubernetes Manifests
Validating YAML with kubeconform
# Install kubeconform (Kubernetes manifest validator)
brew install kubeconform

# Validate a single manifest and print a pass/fail summary
kubeconform -summary deployment.yaml

# Strict mode rejects unknown fields; pin the target Kubernetes version
kubeconform -strict -kubernetes-version 1.28.0 ./manifests/

# Render Helm templates and validate the resulting manifests
helm template mychart | kubeconform -summary
Policy Testing with Conftest
# Install conftest (OPA-based policy testing for config files)
brew install conftest

# Create a directory for Rego policies
mkdir policy

# policy/deployment.rego
package main

# Deny Deployments whose pod securityContext does not enforce non-root.
deny[msg] {
	input.kind == "Deployment"
	not input.spec.template.spec.securityContext.runAsNonRoot
	msg = "Containers must run as non-root"
}

# Deny Deployments with any container missing resource limits.
deny[msg] {
	input.kind == "Deployment"
	container := input.spec.template.spec.containers[_]
	not container.resources.limits
	msg = sprintf("Container %s must have resource limits", [container.name])
}

# Deny Deployments with any privileged container.
deny[msg] {
	input.kind == "Deployment"
	container := input.spec.template.spec.containers[_]
	container.securityContext.privileged
	msg = sprintf("Container %s must not be privileged", [container.name])
}

# Run policy tests (conftest reads ./policy by default)
conftest test deployment.yaml
conftest test --policy policy/ ./manifests/
Helm Chart Testing
# charts/webapp/tests/deployment_test.yaml
# helm-unittest suite: renders deployment.yaml with the given values and
# asserts on the resulting manifest.
suite: deployment tests
templates:
  - deployment.yaml
tests:
  - it: should set correct replicas
    set:
      replicaCount: 5
    asserts:
      - equal:
          path: spec.replicas
          value: 5
  - it: should set resource limits
    asserts:
      - isNotNull:
          path: spec.template.spec.containers[0].resources.limits
  - it: should use correct image
    set:
      image.repository: myapp
      image.tag: v1.0.0
    asserts:
      - equal:
          path: spec.template.spec.containers[0].image
          value: myapp:v1.0.0

# Install helm-unittest plugin
helm plugin install https://github.com/helm-unittest/helm-unittest

# Run the test suites under charts/webapp/tests/
helm unittest ./charts/webapp
Integration Testing
Testing with Kind
# Create an ephemeral test cluster
kind create cluster --name test-cluster

# Deploy the application under test
kubectl apply -f manifests/

# Run Go integration tests against the cluster
go test ./integration/... -v

# Tear the cluster down when finished
kind delete cluster --name test-cluster
Integration Test Example
// integration/deployment_test.go
package integration
import (
"context"
"testing"
"time"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/clientcmd"
)
// TestDeploymentCreation waits (up to 2 minutes) for the "webapp" Deployment
// in the default namespace to report all desired replicas ready, polling the
// API server every 5 seconds.
func TestDeploymentCreation(t *testing.T) {
	// Build a client from the local kubeconfig (~/.kube/config).
	config, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		t.Fatal(err)
	}
	clientset, err := kubernetes.NewForConfig(config)
	if err != nil {
		t.Fatal(err)
	}

	// Bound the whole wait so a broken rollout fails the test instead of hanging.
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
	defer cancel()

	// Poll until the Deployment reports the desired number of ready replicas.
	for {
		deployment, err := clientset.AppsV1().Deployments("default").Get(ctx, "webapp", metav1.GetOptions{})
		if err != nil {
			t.Fatal(err)
		}
		// Spec.Replicas is a *int32 and may be nil (the API defaults it to 1);
		// guard the dereference instead of risking a panic.
		want := int32(1)
		if deployment.Spec.Replicas != nil {
			want = *deployment.Spec.Replicas
		}
		// >= rather than ==: don't miss readiness if extra replicas are
		// momentarily ready during a rollout.
		if deployment.Status.ReadyReplicas >= want {
			break
		}
		select {
		case <-ctx.Done():
			t.Fatal("timeout waiting for deployment default/webapp to become ready")
		case <-time.After(5 * time.Second):
		}
	}
}
End-to-End Testing
E2E with Cypress (a similar Job setup works for Playwright)
# e2e-test-job.yaml
# One-shot Job that runs the Cypress suite against the in-cluster service.
apiVersion: batch/v1
kind: Job
metadata:
  name: e2e-tests
spec:
  template:
    spec:
      containers:
        - name: e2e
          image: cypress/included:12.0.0
          env:
            # Point Cypress at the application's in-cluster Service DNS name.
            - name: CYPRESS_BASE_URL
              value: "http://webapp-service"
          command: ["cypress", "run"]
          volumeMounts:
            # Test specs are supplied via ConfigMap and mounted at /e2e
            # (presumably the image's project directory — verify against the
            # cypress/included image docs).
            - name: tests
              mountPath: /e2e
      volumes:
        - name: tests
          configMap:
            name: e2e-tests
      restartPolicy: Never
  # Retry the whole pod up to 2 times before marking the Job failed.
  backoffLimit: 2
Smoke Tests
# smoke-test.yaml
# Minimal post-deploy check: curl the health and API endpoints in-cluster.
apiVersion: batch/v1
kind: Job
metadata:
  name: smoke-test
spec:
  template:
    spec:
      containers:
        - name: smoke-test
          image: curlimages/curl:latest
          command:
            - /bin/sh
            - -c
            # `set -e` aborts on the first failing command, and `curl -f`
            # exits non-zero on HTTP errors, so any failed check fails the Job.
            - |
              set -e
              echo "Testing health endpoint..."
              curl -f http://webapp-service/health
              echo "Testing API endpoint..."
              curl -f http://webapp-service/api/status
              echo "All smoke tests passed!"
      restartPolicy: Never
  backoffLimit: 3
Chaos Engineering
Chaos Mesh
# Install Chaos Mesh from its Helm chart into a dedicated namespace
helm repo add chaos-mesh https://charts.chaos-mesh.org
helm install chaos-mesh chaos-mesh/chaos-mesh \
  --namespace chaos-testing \
  --create-namespace
Pod Chaos Experiments
# pod-failure.yaml
# Makes one matching pod unavailable for 60s on a recurring schedule.
apiVersion: chaos-mesh.org/v1alpha1
kind: PodChaos
metadata:
  name: pod-failure-test
  namespace: chaos-testing
spec:
  action: pod-failure
  mode: one            # affect a single randomly chosen matched pod
  duration: "60s"
  selector:
    namespaces:
      - production
    labelSelectors:
      app: webapp
  # NOTE(review): Chaos Mesh 2.x runs recurring experiments via a separate
  # Schedule CRD; confirm this inline `scheduler` field is supported by the
  # installed version.
  scheduler:
    cron: "@every 2h"
---
# pod-kill.yaml
# Kills 30% of matching pods; their controller is expected to recreate them.
apiVersion: chaos-mesh.org/v1alpha1
kind: PodChaos
metadata:
  name: pod-kill-test
spec:
  action: pod-kill
  mode: fixed-percent  # act on a fixed percentage of matched pods
  value: "30"
  duration: "30s"
  selector:
    namespaces:
      - production
    labelSelectors:
      app: webapp
Network Chaos
# network-delay.yaml
# Injects 200ms (±50ms jitter) of latency into all matching pods for 5 minutes.
apiVersion: chaos-mesh.org/v1alpha1
kind: NetworkChaos
metadata:
  name: network-delay-test
spec:
  action: delay
  mode: all            # apply to every matched pod
  selector:
    namespaces:
      - production
    labelSelectors:
      app: webapp
  delay:
    latency: "200ms"
    correlation: "50"  # % correlation with the previous packet's delay
    jitter: "50ms"
  duration: "5m"
---
# network-partition.yaml
# Cuts traffic from frontend pods to backend pods for 2 minutes.
apiVersion: chaos-mesh.org/v1alpha1
kind: NetworkChaos
metadata:
  name: network-partition-test
spec:
  action: partition
  mode: all
  selector:
    namespaces:
      - production
    labelSelectors:
      app: frontend
  direction: to        # only source -> target traffic is blocked
  target:
    # NOTE(review): Chaos Mesh targets usually also declare their own `mode`;
    # confirm the installed version accepts a target without one.
    selector:
      namespaces:
        - production
      labelSelectors:
        app: backend
  duration: "2m"
Stress Testing
# stress-cpu.yaml
# Runs 2 CPU workers at 80% load inside one matching pod for 5 minutes,
# exercising behavior under CPU pressure (throttling, HPA reaction, etc.).
apiVersion: chaos-mesh.org/v1alpha1
kind: StressChaos
metadata:
  name: cpu-stress-test
spec:
  mode: one            # stress a single randomly chosen matched pod
  selector:
    namespaces:
      - production
    labelSelectors:
      app: webapp
  stressors:
    cpu:
      workers: 2       # number of stress worker processes
      load: 80         # target CPU load percentage per worker
  duration: "5m"
---
# stress-memory.yaml
# Allocates memory inside one matching pod to test behavior near limits
# (OOM kills, eviction, degradation).
apiVersion: chaos-mesh.org/v1alpha1
kind: StressChaos
metadata:
  name: memory-stress-test
spec:
  mode: one
  selector:
    namespaces:
      - production
    labelSelectors:
      app: webapp
  stressors:
    memory:
      workers: 2
      size: "256MB"    # memory consumed per worker
  duration: "5m"
LitmusChaos
# litmus-experiment.yaml
# LitmusChaos engine: repeatedly deletes pods of the target deployment.
apiVersion: litmuschaos.io/v1alpha1
kind: ChaosEngine
metadata:
  name: webapp-chaos
  namespace: production
spec:
  appinfo:
    appns: production        # namespace of the app under test
    applabel: app=webapp     # label selecting the target pods
    appkind: deployment
  # ServiceAccount with the RBAC permissions needed to run the experiment.
  chaosServiceAccount: litmus-admin
  experiments:
    - name: pod-delete
      spec:
        components:
          env:
            - name: TOTAL_CHAOS_DURATION
              value: "30"    # total experiment duration (seconds)
            - name: CHAOS_INTERVAL
              value: "10"    # seconds between successive pod deletions
            - name: FORCE
              value: "false" # graceful delete (respects termination grace period)
CI/CD Integration
GitHub Actions Pipeline
# .github/workflows/k8s-test.yaml
# Three-stage pipeline: static validation -> integration on Kind -> chaos (main only).
name: Kubernetes Tests

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
  # Fast, cluster-free checks: schema validation and OPA policy tests.
  validate:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      # kubeconform replaces kubeval (archived/unmaintained) and matches the
      # validator used everywhere else in this workflow's docs.
      - name: Validate manifests
        run: |
          curl -sSL https://github.com/yannh/kubeconform/releases/latest/download/kubeconform-linux-amd64.tar.gz | tar xz
          ./kubeconform -strict -summary ./manifests
      - name: Policy tests
        run: |
          # conftest is not preinstalled on GitHub-hosted runners.
          curl -sSL https://github.com/open-policy-agent/conftest/releases/latest/download/conftest_Linux_x86_64.tar.gz | tar xz
          ./conftest test ./manifests --policy ./policy

  # Ephemeral Kind cluster: deploy, smoke-test, then run E2E.
  integration:
    runs-on: ubuntu-latest
    needs: validate
    steps:
      - uses: actions/checkout@v4
      - name: Create Kind cluster
        uses: helm/kind-action@v1
      - name: Deploy application
        run: |
          kubectl apply -f ./manifests
          kubectl wait --for=condition=ready pod -l app=webapp --timeout=120s
      - name: Run integration tests
        run: |
          kubectl apply -f ./tests/smoke-test.yaml
          kubectl wait --for=condition=complete job/smoke-test --timeout=60s
      - name: Run E2E tests
        run: |
          # Install JS dependencies before invoking the test script.
          npm ci
          npm run test:e2e

  # Chaos experiments only on main: verify the app survives pod failures.
  chaos:
    runs-on: ubuntu-latest
    needs: integration
    if: github.ref == 'refs/heads/main'
    steps:
      - uses: actions/checkout@v4
      - name: Create Kind cluster
        uses: helm/kind-action@v1
      - name: Install Chaos Mesh
        run: |
          helm repo add chaos-mesh https://charts.chaos-mesh.org
          helm install chaos-mesh chaos-mesh/chaos-mesh -n chaos-testing --create-namespace
      - name: Deploy application
        run: kubectl apply -f ./manifests
      - name: Run chaos experiments
        run: |
          kubectl apply -f ./chaos/pod-failure.yaml
          sleep 120
          kubectl get pods -l app=webapp
GitLab CI Pipeline
# .gitlab-ci.yml
# Three stages: static validation, integration on an ephemeral cluster, chaos.
stages:
  - validate
  - test
  - chaos

validate-manifests:
  stage: validate
  # NOTE(review): kubeval is archived; consider kubeconform (used earlier in
  # this pipeline's docs) as a maintained replacement.
  image: instrumenta/kubeval
  script:
    - kubeval --strict ./manifests/*.yaml

policy-check:
  stage: validate
  image: openpolicyagent/conftest
  script:
    - conftest test ./manifests --policy ./policy

integration-tests:
  stage: test
  # NOTE(review): the bitnami/kubectl image does not ship the `kind` binary or
  # a Docker client; this job likely needs a custom image or an install
  # step — verify before relying on it.
  image: bitnami/kubectl
  services:
    - name: docker:dind
  before_script:
    - kind create cluster
  script:
    - kubectl apply -f ./manifests
    - kubectl wait --for=condition=ready pod -l app=webapp --timeout=120s
    - kubectl apply -f ./tests/smoke-test.yaml
    - kubectl wait --for=condition=complete job/smoke-test --timeout=60s

chaos-tests:
  stage: chaos
  only:
    - main
  # NOTE(review): no image or cluster context is declared here; kubectl must
  # already be available and configured for the target cluster.
  script:
    - kubectl apply -f ./chaos/experiments/
    # Let the experiments run before checking that the app recovered.
    - sleep 300
    - ./scripts/verify-recovery.sh
Load Testing
k6 Load Tests
// load-test.js
// k6 load profile: ramp to 100 VUs, hold, step up to 200, hold, ramp down.
import http from 'k6/http';
import { check, sleep } from 'k6';

export const options = {
  stages: [
    { duration: '2m', target: 100 },
    { duration: '5m', target: 100 },
    { duration: '2m', target: 200 },
    { duration: '5m', target: 200 },
    { duration: '2m', target: 0 },
  ],
  // Fail the run if p95 latency exceeds 500ms or >1% of requests error.
  thresholds: {
    http_req_duration: ['p(95)<500'],
    http_req_failed: ['rate<0.01'],
  },
};

// Each virtual user repeatedly fetches the API endpoint, verifies the
// response, and pauses one second between iterations.
export default function () {
  const response = http.get('http://webapp-service/api/data');
  check(response, {
    'status is 200': (r) => r.status === 200,
    'response time < 500ms': (r) => r.timings.duration < 500,
  });
  sleep(1);
}
# k6-job.yaml
# One-shot Job running the k6 script supplied by the k6-scripts ConfigMap.
apiVersion: batch/v1
kind: Job
metadata:
  name: load-test
spec:
  template:
    spec:
      containers:
        - name: k6
          image: grafana/k6:latest
          command: ["k6", "run", "/scripts/load-test.js"]
          volumeMounts:
            - name: scripts
              mountPath: /scripts
      volumes:
        - name: scripts
          configMap:
            name: k6-scripts
      restartPolicy: Never
Security Testing
Trivy Container Scanning
# trivy-scan-job.yaml
# Scans the myapp:latest image; --exit-code 1 makes the Job fail when any
# HIGH or CRITICAL vulnerability is found.
apiVersion: batch/v1
kind: Job
metadata:
  name: trivy-scan
spec:
  template:
    spec:
      containers:
        - name: trivy
          image: aquasec/trivy:latest
          command:
            - trivy
            - image
            - --severity
            - HIGH,CRITICAL
            - --exit-code
            - "1"
            - myapp:latest
      restartPolicy: Never
Kube-bench Security Audit
# Run kube-bench as a one-shot Job (CIS Kubernetes Benchmark checks)
kubectl apply -f https://raw.githubusercontent.com/aquasecurity/kube-bench/main/job.yaml

# Check results from the completed pod's logs
kubectl logs -l app=kube-bench
Key Takeaways
- Test pyramid: Unit tests at base, E2E at top
- Validate manifests with kubeconform and conftest
- Integration tests use Kind for ephemeral clusters
- Chaos engineering validates resilience with Chaos Mesh or Litmus
- CI/CD integration automates testing on every change
- Load testing with k6 ensures performance under stress
- Security scanning catches vulnerabilities early
Next Steps
Now that you understand QA practices, you’re ready to explore Authentication and RBAC in the next post for securing your Kubernetes cluster.