248 lines
7.5 KiB
Go
248 lines
7.5 KiB
Go
package k8s
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"log/slog"
|
|
"time"
|
|
|
|
corev1 "k8s.io/api/core/v1"
|
|
"k8s.io/apimachinery/pkg/api/resource"
|
|
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
|
|
|
"github.com/iliaivanov/spec-kit-remote/cmd/dev-pod-api/internal/model"
|
|
)
|
|
|
|
// CreateRunnerPodOpts holds parameters for creating a runner pod.
type CreateRunnerPodOpts struct {
	// User selects the target namespace (via model.NamespaceName) and is
	// attached to the pod as the "user" label.
	User string
	// RunnerID names the pod (via model.RunnerPodName) and is exported to
	// the container as RUNNER_ID; also set as the "runner-id" label.
	RunnerID string
	// Tools is passed through to the runner container as DEV_TOOLS.
	Tools string
	// Task is passed through as RUNNER_TASK.
	Task string
	// RepoURL is passed through as RUNNER_REPO.
	RepoURL string
	// Branch is passed through as RUNNER_BRANCH.
	Branch string
	// CPUReq is the CPU request/limit quantity; defaults to "2" when empty.
	CPUReq string
	// MemReq is the memory request/limit quantity; defaults to "4Gi" when empty.
	MemReq string
	// ForgejoRunnerToken is exported as FORGEJO_RUNNER_TOKEN.
	ForgejoRunnerToken string
	// ForgejoURL is exported as FORGEJO_URL; falls back to the client
	// config's ForgejoURL when empty.
	ForgejoURL string
}
|
|
|
|
// CreateRunnerPod creates an ephemeral runner pod in the user's namespace.
|
|
func (c *Client) CreateRunnerPod(ctx context.Context, opts CreateRunnerPodOpts) (string, error) {
|
|
ns := model.NamespaceName(opts.User)
|
|
podName := model.RunnerPodName(opts.RunnerID)
|
|
|
|
cpuReq := opts.CPUReq
|
|
if cpuReq == "" {
|
|
cpuReq = "2"
|
|
}
|
|
memReq := opts.MemReq
|
|
if memReq == "" {
|
|
memReq = "4Gi"
|
|
}
|
|
|
|
forgejoURL := opts.ForgejoURL
|
|
if forgejoURL == "" {
|
|
forgejoURL = c.Config.ForgejoURL
|
|
}
|
|
|
|
devPodAPIURL := c.Config.DevPodAPIURL
|
|
if devPodAPIURL == "" {
|
|
devPodAPIURL = "http://dev-pod-api.dev-infra.svc:8080"
|
|
}
|
|
|
|
privileged := true
|
|
pod := &corev1.Pod{
|
|
ObjectMeta: metav1.ObjectMeta{
|
|
Name: podName,
|
|
Namespace: ns,
|
|
Labels: map[string]string{
|
|
"app": "dev-pod-runner",
|
|
"runner-id": opts.RunnerID,
|
|
"user": opts.User,
|
|
},
|
|
},
|
|
Spec: corev1.PodSpec{
|
|
RestartPolicy: corev1.RestartPolicyNever,
|
|
Containers: []corev1.Container{
|
|
{
|
|
Name: "runner",
|
|
Image: fmt.Sprintf("%s/%s", c.Config.Registry, c.Config.GoldenImage),
|
|
Env: []corev1.EnvVar{
|
|
{Name: "RUNNER_MODE", Value: "true"},
|
|
{Name: "RUNNER_ID", Value: opts.RunnerID},
|
|
{Name: "DEV_POD_API_URL", Value: devPodAPIURL},
|
|
{Name: "FORGEJO_URL", Value: forgejoURL},
|
|
{Name: "FORGEJO_RUNNER_TOKEN", Value: opts.ForgejoRunnerToken},
|
|
{Name: "RUNNER_REPO", Value: opts.RepoURL},
|
|
{Name: "RUNNER_BRANCH", Value: opts.Branch},
|
|
{Name: "RUNNER_TASK", Value: opts.Task},
|
|
{Name: "DEV_TOOLS", Value: opts.Tools},
|
|
},
|
|
Resources: corev1.ResourceRequirements{
|
|
Requests: corev1.ResourceList{
|
|
corev1.ResourceCPU: resource.MustParse(cpuReq),
|
|
corev1.ResourceMemory: resource.MustParse(memReq),
|
|
},
|
|
Limits: corev1.ResourceList{
|
|
corev1.ResourceCPU: resource.MustParse(cpuReq),
|
|
corev1.ResourceMemory: resource.MustParse(memReq),
|
|
},
|
|
},
|
|
},
|
|
{
|
|
Name: "ipip-sidecar",
|
|
Image: fmt.Sprintf("%s/claw-ipip-tunnel:dev", c.Config.Registry),
|
|
ImagePullPolicy: corev1.PullAlways,
|
|
SecurityContext: &corev1.SecurityContext{
|
|
Privileged: &privileged,
|
|
Capabilities: &corev1.Capabilities{
|
|
Add: []corev1.Capability{"NET_ADMIN"},
|
|
},
|
|
},
|
|
Resources: corev1.ResourceRequirements{
|
|
Requests: corev1.ResourceList{
|
|
corev1.ResourceCPU: resource.MustParse("10m"),
|
|
corev1.ResourceMemory: resource.MustParse("8Mi"),
|
|
},
|
|
Limits: corev1.ResourceList{
|
|
corev1.ResourceCPU: resource.MustParse("50m"),
|
|
corev1.ResourceMemory: resource.MustParse("32Mi"),
|
|
},
|
|
},
|
|
Env: []corev1.EnvVar{
|
|
{Name: "POD_ID", Value: podName},
|
|
{Name: "VPN_GATEWAY_HOST", Value: fmt.Sprintf("vpn-gateway.%s.svc", c.Config.VPNGatewayNS)},
|
|
{
|
|
Name: "VPN_GATEWAY_KEY",
|
|
ValueFrom: &corev1.EnvVarSource{
|
|
SecretKeyRef: &corev1.SecretKeySelector{
|
|
LocalObjectReference: corev1.LocalObjectReference{Name: "dev-secrets"},
|
|
Key: "VPN_GATEWAY_KEY",
|
|
},
|
|
},
|
|
},
|
|
},
|
|
},
|
|
},
|
|
},
|
|
}
|
|
|
|
if _, err := c.Clientset.CoreV1().Pods(ns).Create(ctx, pod, metav1.CreateOptions{}); err != nil {
|
|
return "", fmt.Errorf("create runner pod: %w", err)
|
|
}
|
|
|
|
return podName, nil
|
|
}
|
|
|
|
// DeleteRunnerPod deletes a runner pod and its scratch PVC from the user's namespace.
|
|
func (c *Client) DeleteRunnerPod(ctx context.Context, user, podName string) error {
|
|
ns := model.NamespaceName(user)
|
|
|
|
if err := c.Clientset.CoreV1().Pods(ns).Delete(ctx, podName, metav1.DeleteOptions{}); err != nil {
|
|
slog.Warn("delete runner pod", "pod", podName, "ns", ns, "error", err)
|
|
}
|
|
|
|
pvcName := "workspace-" + podName
|
|
if err := c.Clientset.CoreV1().PersistentVolumeClaims(ns).Delete(ctx, pvcName, metav1.DeleteOptions{}); err != nil {
|
|
slog.Warn("delete runner pvc", "pvc", pvcName, "ns", ns, "error", err)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// RunnerCleanupStore is the subset of store operations needed by the cleanup goroutine.
type RunnerCleanupStore interface {
	// ListRunners returns runners matching the optional user and status
	// filters (empty string means no filtering on that field).
	ListRunners(ctx context.Context, userFilter string, statusFilter string) ([]model.Runner, error)
	// UpdateRunnerStatus transitions a runner record to newStatus;
	// forgejoRunnerID may be empty when not applicable.
	UpdateRunnerStatus(ctx context.Context, id string, newStatus model.RunnerStatus, forgejoRunnerID string) error
	// DeleteRunner removes the runner record from the store.
	DeleteRunner(ctx context.Context, id string) error
	// GetStaleRunners returns runners older than ttl that should be
	// force-destroyed.
	GetStaleRunners(ctx context.Context, ttl time.Duration) ([]model.Runner, error)
}
|
|
|
|
// RunnerCleaner polls for completed/stale runners and cleans up their k8s resources.
type RunnerCleaner struct {
	k8s    *Client            // used to delete runner pods/PVCs
	store  RunnerCleanupStore // runner record persistence
	logger *slog.Logger
	// ttl is the maximum runner age before force-destruction (stale cleanup).
	ttl time.Duration
	// interval is how often the cleanup loop ticks.
	interval time.Duration
}
|
|
|
|
// NewRunnerCleaner creates a runner cleanup goroutine.
|
|
func NewRunnerCleaner(k8s *Client, st RunnerCleanupStore, logger *slog.Logger) *RunnerCleaner {
|
|
return &RunnerCleaner{
|
|
k8s: k8s,
|
|
store: st,
|
|
logger: logger,
|
|
ttl: 2 * time.Hour,
|
|
interval: 30 * time.Second,
|
|
}
|
|
}
|
|
|
|
// Start runs the cleanup loop until the context is cancelled.
|
|
func (rc *RunnerCleaner) Start(ctx context.Context) {
|
|
rc.logger.Info("runner cleaner started", "ttl", rc.ttl, "interval", rc.interval)
|
|
ticker := time.NewTicker(rc.interval)
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
rc.logger.Info("runner cleaner stopped")
|
|
return
|
|
case <-ticker.C:
|
|
rc.cleanupCompleted(ctx)
|
|
rc.cleanupStale(ctx)
|
|
}
|
|
}
|
|
}
|
|
|
|
func (rc *RunnerCleaner) cleanupCompleted(ctx context.Context) {
|
|
for _, status := range []string{"completed", "failed"} {
|
|
runners, err := rc.store.ListRunners(ctx, "", status)
|
|
if err != nil {
|
|
rc.logger.Error("list runners for cleanup", "status", status, "error", err)
|
|
continue
|
|
}
|
|
for _, r := range runners {
|
|
rc.logger.Info("cleaning up runner", "id", r.ID, "status", r.Status, "user", r.User)
|
|
|
|
if err := rc.store.UpdateRunnerStatus(ctx, r.ID, model.RunnerStatusCleanupPending, ""); err != nil {
|
|
rc.logger.Error("mark runner cleanup_pending", "id", r.ID, "error", err)
|
|
continue
|
|
}
|
|
|
|
if r.PodName != "" {
|
|
if err := rc.k8s.DeleteRunnerPod(ctx, r.User, r.PodName); err != nil {
|
|
rc.logger.Error("delete runner pod", "id", r.ID, "pod", r.PodName, "error", err)
|
|
}
|
|
}
|
|
|
|
if err := rc.store.DeleteRunner(ctx, r.ID); err != nil {
|
|
rc.logger.Error("delete runner record", "id", r.ID, "error", err)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func (rc *RunnerCleaner) cleanupStale(ctx context.Context) {
|
|
stale, err := rc.store.GetStaleRunners(ctx, rc.ttl)
|
|
if err != nil {
|
|
rc.logger.Error("get stale runners", "error", err)
|
|
return
|
|
}
|
|
for _, r := range stale {
|
|
rc.logger.Warn("force-destroying stale runner", "id", r.ID, "status", r.Status,
|
|
"age", time.Since(r.CreatedAt).Round(time.Second))
|
|
|
|
if r.PodName != "" {
|
|
if err := rc.k8s.DeleteRunnerPod(ctx, r.User, r.PodName); err != nil {
|
|
rc.logger.Error("delete stale runner pod", "id", r.ID, "error", err)
|
|
}
|
|
}
|
|
|
|
if err := rc.store.DeleteRunner(ctx, r.ID); err != nil {
|
|
rc.logger.Error("delete stale runner record", "id", r.ID, "error", err)
|
|
}
|
|
}
|
|
}
|