package k8s

import (
	"context"
	"fmt"
	"log/slog"
	"time"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"github.com/iliaivanov/spec-kit-remote/cmd/dev-pod-api/internal/model"
)

// CreateRunnerPodOpts holds parameters for creating a runner pod.
type CreateRunnerPodOpts struct {
	User               string // owning user; determines the target namespace
	RunnerID           string // unique runner identifier; determines the pod name
	Tools              string // passed through as DEV_TOOLS
	Task               string // passed through as RUNNER_TASK
	RepoURL            string // repository clone URL for the runner
	Branch             string // branch the runner checks out
	CPUReq             string // CPU request/limit quantity; defaults to "2" when empty
	MemReq             string // memory request/limit quantity; defaults to "4Gi" when empty
	ForgejoRunnerToken string // registration token injected into the runner container
	ForgejoURL         string // Forgejo base URL; falls back to c.Config.ForgejoURL when empty
}

// CreateRunnerPod creates an ephemeral runner pod in the user's namespace.
//
// The pod has two containers: the golden-image runner itself and a privileged
// IPIP tunnel sidecar. It returns the generated pod name, or an error if the
// supplied resource quantities are malformed or the API create call fails.
func (c *Client) CreateRunnerPod(ctx context.Context, opts CreateRunnerPodOpts) (string, error) {
	ns := model.NamespaceName(opts.User)
	podName := model.RunnerPodName(opts.RunnerID)

	cpuReq := opts.CPUReq
	if cpuReq == "" {
		cpuReq = "2"
	}
	memReq := opts.MemReq
	if memReq == "" {
		memReq = "4Gi"
	}

	// CPUReq/MemReq originate from an API request, so validate them with
	// ParseQuantity and return an error instead of using MustParse, which
	// would panic the server on a malformed value.
	cpuQty, err := resource.ParseQuantity(cpuReq)
	if err != nil {
		return "", fmt.Errorf("parse cpu request %q: %w", cpuReq, err)
	}
	memQty, err := resource.ParseQuantity(memReq)
	if err != nil {
		return "", fmt.Errorf("parse memory request %q: %w", memReq, err)
	}

	forgejoURL := opts.ForgejoURL
	if forgejoURL == "" {
		forgejoURL = c.Config.ForgejoURL
	}
	devPodAPIURL := c.Config.DevPodAPIURL
	if devPodAPIURL == "" {
		devPodAPIURL = "http://dev-pod-api.dev-infra.svc:8080"
	}

	privileged := true

	pod := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:      podName,
			Namespace: ns,
			Labels: map[string]string{
				"app":       "dev-pod-runner",
				"runner-id": opts.RunnerID,
				"user":      opts.User,
			},
		},
		Spec: corev1.PodSpec{
			// One-shot workload: never restart; completion is observed by
			// the runner cleanup loop.
			RestartPolicy: corev1.RestartPolicyNever,
			Containers: []corev1.Container{
				{
					Name:  "runner",
					Image: fmt.Sprintf("%s/%s", c.Config.Registry, c.Config.GoldenImage),
					Env: []corev1.EnvVar{
						{Name: "RUNNER_MODE", Value: "true"},
						{Name: "RUNNER_ID", Value: opts.RunnerID},
						{Name: "DEV_POD_API_URL", Value: devPodAPIURL},
						{Name: "FORGEJO_URL", Value: forgejoURL},
						{Name: "FORGEJO_RUNNER_TOKEN", Value: opts.ForgejoRunnerToken},
						{Name: "RUNNER_REPO", Value: opts.RepoURL},
						{Name: "RUNNER_BRANCH", Value: opts.Branch},
						{Name: "RUNNER_TASK", Value: opts.Task},
						{Name: "DEV_TOOLS", Value: opts.Tools},
					},
					// Requests == limits: Guaranteed QoS for the runner.
					Resources: corev1.ResourceRequirements{
						Requests: corev1.ResourceList{
							corev1.ResourceCPU:    cpuQty,
							corev1.ResourceMemory: memQty,
						},
						Limits: corev1.ResourceList{
							corev1.ResourceCPU:    cpuQty,
							corev1.ResourceMemory: memQty,
						},
					},
				},
				{
					Name:            "ipip-sidecar",
					Image:           fmt.Sprintf("%s/claw-ipip-tunnel:dev", c.Config.Registry),
					ImagePullPolicy: corev1.PullAlways,
					// The tunnel sidecar needs privileged + NET_ADMIN to
					// manage the IPIP interface inside the pod netns.
					SecurityContext: &corev1.SecurityContext{
						Privileged: &privileged,
						Capabilities: &corev1.Capabilities{
							Add: []corev1.Capability{"NET_ADMIN"},
						},
					},
					// Fixed literal quantities: MustParse is safe here.
					Resources: corev1.ResourceRequirements{
						Requests: corev1.ResourceList{
							corev1.ResourceCPU:    resource.MustParse("10m"),
							corev1.ResourceMemory: resource.MustParse("8Mi"),
						},
						Limits: corev1.ResourceList{
							corev1.ResourceCPU:    resource.MustParse("50m"),
							corev1.ResourceMemory: resource.MustParse("32Mi"),
						},
					},
					Env: []corev1.EnvVar{
						{Name: "POD_ID", Value: podName},
						{Name: "VPN_GATEWAY_HOST", Value: fmt.Sprintf("vpn-gateway.%s.svc", c.Config.VPNGatewayNS)},
						{
							Name: "VPN_GATEWAY_KEY",
							ValueFrom: &corev1.EnvVarSource{
								SecretKeyRef: &corev1.SecretKeySelector{
									LocalObjectReference: corev1.LocalObjectReference{Name: "dev-secrets"},
									Key:                  "VPN_GATEWAY_KEY",
								},
							},
						},
					},
				},
			},
		},
	}

	if _, err := c.Clientset.CoreV1().Pods(ns).Create(ctx, pod, metav1.CreateOptions{}); err != nil {
		return "", fmt.Errorf("create runner pod: %w", err)
	}
	return podName, nil
}

// DeleteRunnerPod deletes a runner pod and its scratch PVC from the user's namespace.
func (c *Client) DeleteRunnerPod(ctx context.Context, user, podName string) error { ns := model.NamespaceName(user) if err := c.Clientset.CoreV1().Pods(ns).Delete(ctx, podName, metav1.DeleteOptions{}); err != nil { slog.Warn("delete runner pod", "pod", podName, "ns", ns, "error", err) } pvcName := "workspace-" + podName if err := c.Clientset.CoreV1().PersistentVolumeClaims(ns).Delete(ctx, pvcName, metav1.DeleteOptions{}); err != nil { slog.Warn("delete runner pvc", "pvc", pvcName, "ns", ns, "error", err) } return nil } // RunnerCleanupStore is the subset of store operations needed by the cleanup goroutine. type RunnerCleanupStore interface { ListRunners(ctx context.Context, userFilter string, statusFilter string) ([]model.Runner, error) UpdateRunnerStatus(ctx context.Context, id string, newStatus model.RunnerStatus, forgejoRunnerID string) error DeleteRunner(ctx context.Context, id string) error GetStaleRunners(ctx context.Context, ttl time.Duration) ([]model.Runner, error) } // RunnerCleaner polls for completed/stale runners and cleans up their k8s resources. type RunnerCleaner struct { k8s *Client store RunnerCleanupStore logger *slog.Logger ttl time.Duration interval time.Duration } // NewRunnerCleaner creates a runner cleanup goroutine. func NewRunnerCleaner(k8s *Client, st RunnerCleanupStore, logger *slog.Logger) *RunnerCleaner { return &RunnerCleaner{ k8s: k8s, store: st, logger: logger, ttl: 2 * time.Hour, interval: 30 * time.Second, } } // Start runs the cleanup loop until the context is cancelled. 
func (rc *RunnerCleaner) Start(ctx context.Context) { rc.logger.Info("runner cleaner started", "ttl", rc.ttl, "interval", rc.interval) ticker := time.NewTicker(rc.interval) defer ticker.Stop() for { select { case <-ctx.Done(): rc.logger.Info("runner cleaner stopped") return case <-ticker.C: rc.cleanupCompleted(ctx) rc.cleanupStale(ctx) } } } func (rc *RunnerCleaner) cleanupCompleted(ctx context.Context) { for _, status := range []string{"completed", "failed"} { runners, err := rc.store.ListRunners(ctx, "", status) if err != nil { rc.logger.Error("list runners for cleanup", "status", status, "error", err) continue } for _, r := range runners { rc.logger.Info("cleaning up runner", "id", r.ID, "status", r.Status, "user", r.User) if err := rc.store.UpdateRunnerStatus(ctx, r.ID, model.RunnerStatusCleanupPending, ""); err != nil { rc.logger.Error("mark runner cleanup_pending", "id", r.ID, "error", err) continue } if r.PodName != "" { if err := rc.k8s.DeleteRunnerPod(ctx, r.User, r.PodName); err != nil { rc.logger.Error("delete runner pod", "id", r.ID, "pod", r.PodName, "error", err) } } if err := rc.store.DeleteRunner(ctx, r.ID); err != nil { rc.logger.Error("delete runner record", "id", r.ID, "error", err) } } } } func (rc *RunnerCleaner) cleanupStale(ctx context.Context) { stale, err := rc.store.GetStaleRunners(ctx, rc.ttl) if err != nil { rc.logger.Error("get stale runners", "error", err) return } for _, r := range stale { rc.logger.Warn("force-destroying stale runner", "id", r.ID, "status", r.Status, "age", time.Since(r.CreatedAt).Round(time.Second)) if r.PodName != "" { if err := rc.k8s.DeleteRunnerPod(ctx, r.User, r.PodName); err != nil { rc.logger.Error("delete stale runner pod", "id", r.ID, "error", err) } } if err := rc.store.DeleteRunner(ctx, r.ID); err != nil { rc.logger.Error("delete stale runner record", "id", r.ID, "error", err) } } }