dev-pod-api-build/internal/k8s/runners.go
2026-04-16 04:16:36 +00:00

248 lines
7.5 KiB
Go

package k8s
import (
	"context"
	"fmt"
	"log/slog"
	"time"

	corev1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"github.com/iliaivanov/spec-kit-remote/cmd/dev-pod-api/internal/model"
)
// CreateRunnerPodOpts holds parameters for creating a runner pod.
// CreateRunnerPodOpts holds parameters for creating a runner pod.
type CreateRunnerPodOpts struct {
	User               string // owner; used to derive the target namespace and pod labels
	RunnerID           string // unique runner identifier; used for the pod name and labels
	Tools              string // passed to the container as DEV_TOOLS
	Task               string // passed to the container as RUNNER_TASK
	RepoURL            string // passed to the container as RUNNER_REPO
	Branch             string // passed to the container as RUNNER_BRANCH
	CPUReq             string // CPU request/limit quantity (e.g. "2"); defaults to "2" when empty
	MemReq             string // memory request/limit quantity (e.g. "4Gi"); defaults to "4Gi" when empty
	ForgejoRunnerToken string // passed to the container as FORGEJO_RUNNER_TOKEN
	ForgejoURL         string // Forgejo base URL; falls back to the client config when empty
}
// CreateRunnerPod creates an ephemeral runner pod in the user's namespace.
// CreateRunnerPod creates an ephemeral runner pod in the user's namespace.
//
// Defaults are applied for unset fields: CPUReq "2", MemReq "4Gi", ForgejoURL
// from the client config, and the dev-pod API URL from config (falling back to
// the in-cluster service address). The pod runs the golden-image runner
// container plus a privileged ipip-sidecar for VPN tunneling. It returns the
// created pod's name, or an error if a resource quantity fails to parse or the
// Kubernetes API call fails.
func (c *Client) CreateRunnerPod(ctx context.Context, opts CreateRunnerPodOpts) (string, error) {
	ns := model.NamespaceName(opts.User)
	podName := model.RunnerPodName(opts.RunnerID)

	cpuReq := opts.CPUReq
	if cpuReq == "" {
		cpuReq = "2"
	}
	memReq := opts.MemReq
	if memReq == "" {
		memReq = "4Gi"
	}
	// Caller-supplied quantities are untrusted input: parse them explicitly
	// instead of resource.MustParse, which would panic on a malformed value.
	cpuQty, err := resource.ParseQuantity(cpuReq)
	if err != nil {
		return "", fmt.Errorf("parse cpu request %q: %w", cpuReq, err)
	}
	memQty, err := resource.ParseQuantity(memReq)
	if err != nil {
		return "", fmt.Errorf("parse memory request %q: %w", memReq, err)
	}

	forgejoURL := opts.ForgejoURL
	if forgejoURL == "" {
		forgejoURL = c.Config.ForgejoURL
	}
	devPodAPIURL := c.Config.DevPodAPIURL
	if devPodAPIURL == "" {
		devPodAPIURL = "http://dev-pod-api.dev-infra.svc:8080"
	}

	privileged := true
	pod := &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:      podName,
			Namespace: ns,
			Labels: map[string]string{
				"app":       "dev-pod-runner",
				"runner-id": opts.RunnerID,
				"user":      opts.User,
			},
		},
		Spec: corev1.PodSpec{
			// Runner pods are one-shot: never restart after completion/failure.
			RestartPolicy: corev1.RestartPolicyNever,
			Containers: []corev1.Container{
				{
					Name:  "runner",
					Image: fmt.Sprintf("%s/%s", c.Config.Registry, c.Config.GoldenImage),
					Env: []corev1.EnvVar{
						{Name: "RUNNER_MODE", Value: "true"},
						{Name: "RUNNER_ID", Value: opts.RunnerID},
						{Name: "DEV_POD_API_URL", Value: devPodAPIURL},
						{Name: "FORGEJO_URL", Value: forgejoURL},
						{Name: "FORGEJO_RUNNER_TOKEN", Value: opts.ForgejoRunnerToken},
						{Name: "RUNNER_REPO", Value: opts.RepoURL},
						{Name: "RUNNER_BRANCH", Value: opts.Branch},
						{Name: "RUNNER_TASK", Value: opts.Task},
						{Name: "DEV_TOOLS", Value: opts.Tools},
					},
					// Requests == limits (Guaranteed-style sizing) for the main container.
					Resources: corev1.ResourceRequirements{
						Requests: corev1.ResourceList{
							corev1.ResourceCPU:    cpuQty,
							corev1.ResourceMemory: memQty,
						},
						Limits: corev1.ResourceList{
							corev1.ResourceCPU:    cpuQty,
							corev1.ResourceMemory: memQty,
						},
					},
				},
				{
					Name:            "ipip-sidecar",
					Image:           fmt.Sprintf("%s/claw-ipip-tunnel:dev", c.Config.Registry),
					ImagePullPolicy: corev1.PullAlways,
					// The tunnel sidecar needs NET_ADMIN (and runs privileged)
					// to configure the IPIP interface inside the pod netns.
					SecurityContext: &corev1.SecurityContext{
						Privileged: &privileged,
						Capabilities: &corev1.Capabilities{
							Add: []corev1.Capability{"NET_ADMIN"},
						},
					},
					Resources: corev1.ResourceRequirements{
						Requests: corev1.ResourceList{
							corev1.ResourceCPU:    resource.MustParse("10m"),
							corev1.ResourceMemory: resource.MustParse("8Mi"),
						},
						Limits: corev1.ResourceList{
							corev1.ResourceCPU:    resource.MustParse("50m"),
							corev1.ResourceMemory: resource.MustParse("32Mi"),
						},
					},
					Env: []corev1.EnvVar{
						{Name: "POD_ID", Value: podName},
						{Name: "VPN_GATEWAY_HOST", Value: fmt.Sprintf("vpn-gateway.%s.svc", c.Config.VPNGatewayNS)},
						{
							Name: "VPN_GATEWAY_KEY",
							ValueFrom: &corev1.EnvVarSource{
								SecretKeyRef: &corev1.SecretKeySelector{
									LocalObjectReference: corev1.LocalObjectReference{Name: "dev-secrets"},
									Key:                  "VPN_GATEWAY_KEY",
								},
							},
						},
					},
				},
			},
		},
	}

	if _, err := c.Clientset.CoreV1().Pods(ns).Create(ctx, pod, metav1.CreateOptions{}); err != nil {
		return "", fmt.Errorf("create runner pod: %w", err)
	}
	return podName, nil
}
// DeleteRunnerPod deletes a runner pod and its scratch PVC from the user's namespace.
// DeleteRunnerPod deletes a runner pod and its scratch PVC from the user's
// namespace. Deletion is best-effort: failures are logged as warnings and the
// function always returns nil so cleanup of the remaining resources proceeds.
// "Not found" responses are treated as success (the resource is already gone)
// and are not logged, avoiding warning noise on repeated cleanup passes.
func (c *Client) DeleteRunnerPod(ctx context.Context, user, podName string) error {
	ns := model.NamespaceName(user)
	if err := c.Clientset.CoreV1().Pods(ns).Delete(ctx, podName, metav1.DeleteOptions{}); err != nil && !apierrors.IsNotFound(err) {
		slog.Warn("delete runner pod", "pod", podName, "ns", ns, "error", err)
	}
	// Scratch PVC name mirrors the convention used at pod-creation time.
	pvcName := "workspace-" + podName
	if err := c.Clientset.CoreV1().PersistentVolumeClaims(ns).Delete(ctx, pvcName, metav1.DeleteOptions{}); err != nil && !apierrors.IsNotFound(err) {
		slog.Warn("delete runner pvc", "pvc", pvcName, "ns", ns, "error", err)
	}
	return nil
}
// RunnerCleanupStore is the subset of store operations needed by the cleanup goroutine.
// RunnerCleanupStore is the subset of store operations needed by the cleanup goroutine.
type RunnerCleanupStore interface {
	// ListRunners returns runners matching the given user and status filters;
	// empty strings appear to act as wildcards (callers here pass "" for user).
	ListRunners(ctx context.Context, userFilter string, statusFilter string) ([]model.Runner, error)
	// UpdateRunnerStatus sets a runner's status; forgejoRunnerID may be empty.
	UpdateRunnerStatus(ctx context.Context, id string, newStatus model.RunnerStatus, forgejoRunnerID string) error
	// DeleteRunner removes the runner record with the given id.
	DeleteRunner(ctx context.Context, id string) error
	// GetStaleRunners returns runners older than ttl — presumably measured
	// from CreatedAt (cleanupStale logs age via CreatedAt); verify in the store.
	GetStaleRunners(ctx context.Context, ttl time.Duration) ([]model.Runner, error)
}
// RunnerCleaner polls for completed/stale runners and cleans up their k8s resources.
// RunnerCleaner polls for completed/stale runners and cleans up their k8s resources.
type RunnerCleaner struct {
	k8s      *Client            // Kubernetes client used to delete pods/PVCs
	store    RunnerCleanupStore // persistence layer for runner records
	logger   *slog.Logger       // structured logger for cleanup events
	ttl      time.Duration      // max runner age before force-destruction
	interval time.Duration      // polling period of the cleanup loop
}
// NewRunnerCleaner creates a runner cleanup goroutine.
// NewRunnerCleaner returns a RunnerCleaner wired to the given Kubernetes
// client, store, and logger, using the default TTL of 2 hours and a poll
// interval of 30 seconds.
func NewRunnerCleaner(k8s *Client, st RunnerCleanupStore, logger *slog.Logger) *RunnerCleaner {
	rc := RunnerCleaner{
		k8s:    k8s,
		store:  st,
		logger: logger,
	}
	rc.ttl = 2 * time.Hour
	rc.interval = 30 * time.Second
	return &rc
}
// Start runs the cleanup loop until the context is cancelled.
// Start runs the cleanup loop until the context is cancelled. On every tick it
// first sweeps completed/failed runners, then force-destroys stale ones. The
// first sweep happens one full interval after Start is called.
func (rc *RunnerCleaner) Start(ctx context.Context) {
	rc.logger.Info("runner cleaner started", "ttl", rc.ttl, "interval", rc.interval)
	tick := time.NewTicker(rc.interval)
	defer tick.Stop()
	for {
		select {
		case <-tick.C:
			rc.cleanupCompleted(ctx)
			rc.cleanupStale(ctx)
		case <-ctx.Done():
			rc.logger.Info("runner cleaner stopped")
			return
		}
	}
}
// cleanupCompleted tears down runners whose stored status is "completed" or
// "failed". Each runner is first marked cleanup_pending, then its pod (if any)
// is deleted, and finally its record is removed from the store. Errors on a
// single runner are logged and do not stop the sweep.
func (rc *RunnerCleaner) cleanupCompleted(ctx context.Context) {
	terminalStatuses := []string{"completed", "failed"}
	for _, st := range terminalStatuses {
		runners, err := rc.store.ListRunners(ctx, "", st)
		if err != nil {
			rc.logger.Error("list runners for cleanup", "status", st, "error", err)
			continue
		}
		for _, runner := range runners {
			rc.logger.Info("cleaning up runner", "id", runner.ID, "status", runner.Status, "user", runner.User)
			// Mark cleanup_pending first so a crash mid-cleanup leaves a
			// recognizable state; skip deletion if the mark itself fails.
			if err := rc.store.UpdateRunnerStatus(ctx, runner.ID, model.RunnerStatusCleanupPending, ""); err != nil {
				rc.logger.Error("mark runner cleanup_pending", "id", runner.ID, "error", err)
				continue
			}
			if runner.PodName != "" {
				if err := rc.k8s.DeleteRunnerPod(ctx, runner.User, runner.PodName); err != nil {
					rc.logger.Error("delete runner pod", "id", runner.ID, "pod", runner.PodName, "error", err)
				}
			}
			if err := rc.store.DeleteRunner(ctx, runner.ID); err != nil {
				rc.logger.Error("delete runner record", "id", runner.ID, "error", err)
			}
		}
	}
}
// cleanupStale force-destroys runners that have exceeded the cleaner's TTL
// regardless of their status: the pod (if recorded) is deleted and the store
// record removed. Per-runner failures are logged and the sweep continues.
func (rc *RunnerCleaner) cleanupStale(ctx context.Context) {
	staleRunners, err := rc.store.GetStaleRunners(ctx, rc.ttl)
	if err != nil {
		rc.logger.Error("get stale runners", "error", err)
		return
	}
	for _, runner := range staleRunners {
		age := time.Since(runner.CreatedAt).Round(time.Second)
		rc.logger.Warn("force-destroying stale runner", "id", runner.ID, "status", runner.Status, "age", age)
		if runner.PodName != "" {
			if err := rc.k8s.DeleteRunnerPod(ctx, runner.User, runner.PodName); err != nil {
				rc.logger.Error("delete stale runner pod", "id", runner.ID, "error", err)
			}
		}
		if err := rc.store.DeleteRunner(ctx, runner.ID); err != nil {
			rc.logger.Error("delete stale runner record", "id", runner.ID, "error", err)
		}
	}
}