501 lines
16 KiB
Go
501 lines
16 KiB
Go
package k8s
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"log/slog"
|
|
"time"
|
|
|
|
corev1 "k8s.io/api/core/v1"
|
|
k8serrors "k8s.io/apimachinery/pkg/api/errors"
|
|
"k8s.io/apimachinery/pkg/api/resource"
|
|
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
|
"k8s.io/apimachinery/pkg/runtime/schema"
|
|
|
|
"github.com/iliaivanov/spec-kit-remote/cmd/dev-pod-api/internal/model"
|
|
)
|
|
|
|
// Sentinel errors returned by pod lifecycle operations; compare with errors.Is.
var (
	// ErrPodAlreadyExists is returned by CreatePod when a pod with the same
	// name already exists in the user's namespace.
	ErrPodAlreadyExists = errors.New("pod already exists")
	// ErrPodNotFound is returned when the requested pod does not exist.
	ErrPodNotFound = errors.New("pod not found")
	// ErrQuotaExceeded is returned when a create would violate the user's
	// concurrent-pod or per-pod resource quota.
	ErrQuotaExceeded = errors.New("quota exceeded")
)
|
|
|
|
// traefikIngressRouteGVR identifies the Traefik IngressRoute CRD for the
// dynamic client (traefik.io/v1alpha1 ingressroutes).
var traefikIngressRouteGVR = schema.GroupVersionResource{
	Group: "traefik.io",
	Version: "v1alpha1",
	Resource: "ingressroutes",
}

// traefikMiddlewareGVR identifies the Traefik Middleware CRD for the
// dynamic client (traefik.io/v1alpha1 middlewares).
var traefikMiddlewareGVR = schema.GroupVersionResource{
	Group: "traefik.io",
	Version: "v1alpha1",
	Resource: "middlewares",
}
|
|
|
|
// CreatePodOpts holds parameters for creating a complete pod with all resources.
type CreatePodOpts struct {
	User string // owner; determines the target namespace
	Pod string // short pod name; expanded via model.PodFullName
	Tools string // dev tools selection, injected as DEV_TOOLS env var
	Task string // task description, injected as TASK_DESCRIPTION env var
	CPUReq string // CPU request (k8s resource quantity string)
	CPULimit string // CPU limit (k8s resource quantity string)
	MemReq string // memory request (k8s resource quantity string)
	MemLimit string // memory limit (k8s resource quantity string)
	MaxConcurrentPods int // 0 = no limit
	MaxCPUPerPod int // 0 = no limit
	MaxRAMGBPerPod int // 0 = no limit
	ForgejoToken string // Forgejo API token stored in the namespace secrets
	TailscaleKey string // Tailscale auth key stored in the namespace secrets
}
|
|
|
|
// FetchVPNKey reads the VPN gateway key from the configured secret in the gateway namespace.
|
|
func (c *Client) FetchVPNKey(ctx context.Context) (string, error) {
|
|
secret, err := c.Clientset.CoreV1().Secrets(c.Config.VPNGatewayNS).Get(
|
|
ctx, c.Config.VPNGatewaySecret, metav1.GetOptions{},
|
|
)
|
|
if err != nil {
|
|
return "", fmt.Errorf("get vpn gateway secret %s/%s: %w",
|
|
c.Config.VPNGatewayNS, c.Config.VPNGatewaySecret, err)
|
|
}
|
|
|
|
key, ok := secret.Data["VPN_GATEWAY_KEY"]
|
|
if !ok {
|
|
return "", fmt.Errorf("VPN_GATEWAY_KEY field not found in secret %s/%s",
|
|
c.Config.VPNGatewayNS, c.Config.VPNGatewaySecret)
|
|
}
|
|
return string(key), nil
|
|
}
|
|
|
|
// ValidatePodQuota checks per-pod CPU and RAM limits against the user's quota.
|
|
func ValidatePodQuota(cpuLimit string, maxCPUPerPod int, memLimit string, maxRAMGBPerPod int) error {
|
|
if maxCPUPerPod > 0 && cpuLimit != "" {
|
|
cpu, err := resource.ParseQuantity(cpuLimit)
|
|
if err != nil {
|
|
return fmt.Errorf("parse cpu_limit %q: %w", cpuLimit, err)
|
|
}
|
|
maxCPU := resource.MustParse(fmt.Sprintf("%d", maxCPUPerPod))
|
|
if cpu.Cmp(maxCPU) > 0 {
|
|
return fmt.Errorf("%w: cpu_limit %s exceeds max %d", ErrQuotaExceeded, cpuLimit, maxCPUPerPod)
|
|
}
|
|
}
|
|
if maxRAMGBPerPod > 0 && memLimit != "" {
|
|
mem, err := resource.ParseQuantity(memLimit)
|
|
if err != nil {
|
|
return fmt.Errorf("parse mem_limit %q: %w", memLimit, err)
|
|
}
|
|
maxMem := resource.MustParse(fmt.Sprintf("%dGi", maxRAMGBPerPod))
|
|
if mem.Cmp(maxMem) > 0 {
|
|
return fmt.Errorf("%w: mem_limit %s exceeds max %dGi", ErrQuotaExceeded, memLimit, maxRAMGBPerPod)
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// CreatePod creates a complete dev pod with all associated k8s resources.
|
|
// Shared resources (namespace, PVC, secrets, configmap, network policy) are created idempotently.
|
|
// Pod-specific resources (pod, service, ingress) are cleaned up on partial failure.
|
|
func (c *Client) CreatePod(ctx context.Context, opts CreatePodOpts) (*model.Pod, error) {
|
|
ns := model.NamespaceName(opts.User)
|
|
|
|
// Check quota: count existing pods for this user
|
|
if opts.MaxConcurrentPods > 0 {
|
|
existingPods, err := c.Clientset.CoreV1().Pods(ns).List(ctx, metav1.ListOptions{
|
|
LabelSelector: "app=dev-pod",
|
|
})
|
|
if err != nil && !k8serrors.IsNotFound(err) {
|
|
return nil, fmt.Errorf("list existing pods: %w", err)
|
|
}
|
|
if existingPods != nil && len(existingPods.Items) >= opts.MaxConcurrentPods {
|
|
return nil, ErrQuotaExceeded
|
|
}
|
|
}
|
|
|
|
// Check per-pod resource quota
|
|
if err := ValidatePodQuota(opts.CPULimit, opts.MaxCPUPerPod, opts.MemLimit, opts.MaxRAMGBPerPod); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Check for duplicate pod
|
|
podFullName := model.PodFullName(opts.Pod)
|
|
_, err := c.Clientset.CoreV1().Pods(ns).Get(ctx, podFullName, metav1.GetOptions{})
|
|
if err == nil {
|
|
return nil, ErrPodAlreadyExists
|
|
}
|
|
if !k8serrors.IsNotFound(err) {
|
|
return nil, fmt.Errorf("check existing pod: %w", err)
|
|
}
|
|
|
|
// Fetch VPN key
|
|
vpnKey, err := c.FetchVPNKey(ctx)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("fetch vpn key: %w", err)
|
|
}
|
|
|
|
// Create shared resources (idempotent)
|
|
if err := c.EnsureNamespace(ctx, opts.User); err != nil {
|
|
return nil, fmt.Errorf("ensure namespace: %w", err)
|
|
}
|
|
|
|
if err := c.ensureSharedResources(ctx, opts.User, vpnKey, opts.ForgejoToken, opts.TailscaleKey); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Create per-pod PVC
|
|
pvc := PVCTemplate(opts.User, opts.Pod)
|
|
if _, err := c.Clientset.CoreV1().PersistentVolumeClaims(ns).Create(ctx, pvc, metav1.CreateOptions{}); err != nil {
|
|
if !k8serrors.IsAlreadyExists(err) {
|
|
return nil, fmt.Errorf("create per-pod pvc: %w", err)
|
|
}
|
|
}
|
|
|
|
// Create pod-specific resources with cleanup on failure
|
|
podOpts := PodOpts{
|
|
User: opts.User,
|
|
Pod: opts.Pod,
|
|
Tools: opts.Tools,
|
|
Task: opts.Task,
|
|
CPUReq: opts.CPUReq,
|
|
CPULimit: opts.CPULimit,
|
|
MemReq: opts.MemReq,
|
|
MemLimit: opts.MemLimit,
|
|
VPNKey: vpnKey,
|
|
AnthropicKey: c.Config.AnthropicKey,
|
|
OpenAIKey: c.Config.OpenAIKey,
|
|
ForgejoToken: opts.ForgejoToken,
|
|
TailscaleKey: opts.TailscaleKey,
|
|
}
|
|
|
|
var cleanups []func()
|
|
doCleanup := func() {
|
|
for i := len(cleanups) - 1; i >= 0; i-- {
|
|
cleanups[i]()
|
|
}
|
|
}
|
|
|
|
// Register PVC cleanup
|
|
cleanups = append(cleanups, func() {
|
|
if delErr := c.Clientset.CoreV1().PersistentVolumeClaims(ns).Delete(ctx, pvc.Name, metav1.DeleteOptions{}); delErr != nil {
|
|
slog.Warn("cleanup: failed to delete pvc", "pvc", pvc.Name, "error", delErr)
|
|
}
|
|
})
|
|
|
|
// Create Pod
|
|
pod := PodTemplate(c.Config, podOpts)
|
|
if _, err := c.Clientset.CoreV1().Pods(ns).Create(ctx, pod, metav1.CreateOptions{}); err != nil {
|
|
return nil, fmt.Errorf("create pod: %w", err)
|
|
}
|
|
cleanups = append(cleanups, func() {
|
|
if delErr := c.Clientset.CoreV1().Pods(ns).Delete(ctx, pod.Name, metav1.DeleteOptions{}); delErr != nil {
|
|
slog.Warn("cleanup: failed to delete pod", "pod", pod.Name, "error", delErr)
|
|
}
|
|
})
|
|
|
|
// Create Service
|
|
svc := ServiceTemplate(opts.User, opts.Pod)
|
|
if _, err := c.Clientset.CoreV1().Services(ns).Create(ctx, svc, metav1.CreateOptions{}); err != nil {
|
|
doCleanup()
|
|
return nil, fmt.Errorf("create service: %w", err)
|
|
}
|
|
cleanups = append(cleanups, func() {
|
|
if delErr := c.Clientset.CoreV1().Services(ns).Delete(ctx, svc.Name, metav1.DeleteOptions{}); delErr != nil {
|
|
slog.Warn("cleanup: failed to delete service", "service", svc.Name, "error", delErr)
|
|
}
|
|
})
|
|
|
|
// Create IngressRoute + Middlewares
|
|
ingressObjs := IngressTemplate(opts.User, opts.Pod, c.Config.Domain)
|
|
for _, obj := range ingressObjs {
|
|
gvr, err := traefikGVR(obj.GetKind())
|
|
if err != nil {
|
|
doCleanup()
|
|
return nil, fmt.Errorf("resolve GVR for %s: %w", obj.GetKind(), err)
|
|
}
|
|
_, err = c.Dynamic.Resource(gvr).Namespace(ns).Create(ctx, obj, metav1.CreateOptions{})
|
|
if err != nil {
|
|
if k8serrors.IsAlreadyExists(err) {
|
|
continue // shared resource already exists, don't register cleanup
|
|
}
|
|
doCleanup()
|
|
return nil, fmt.Errorf("create %s %s: %w", obj.GetKind(), obj.GetName(), err)
|
|
}
|
|
name := obj.GetName()
|
|
kind := obj.GetKind()
|
|
cleanups = append(cleanups, func() {
|
|
gvr, gvrErr := traefikGVR(kind)
|
|
if gvrErr != nil {
|
|
slog.Warn("cleanup: unknown traefik kind", "kind", kind, "error", gvrErr)
|
|
return
|
|
}
|
|
if delErr := c.Dynamic.Resource(gvr).Namespace(ns).Delete(ctx, name, metav1.DeleteOptions{}); delErr != nil {
|
|
slog.Warn("cleanup: failed to delete "+kind, "name", name, "error", delErr)
|
|
}
|
|
})
|
|
}
|
|
|
|
return &model.Pod{
|
|
User: opts.User,
|
|
Name: opts.Pod,
|
|
Tools: opts.Tools,
|
|
CPUReq: opts.CPUReq,
|
|
CPULimit: opts.CPULimit,
|
|
MemReq: opts.MemReq,
|
|
MemLimit: opts.MemLimit,
|
|
Task: opts.Task,
|
|
Status: "Pending",
|
|
URL: model.PodURL(c.Config.Domain, opts.User, opts.Pod),
|
|
}, nil
|
|
}
|
|
|
|
// DeletePod removes a single pod and its associated service and ingress resources.
|
|
// Keeps the namespace and shared resources if other pods exist.
|
|
func (c *Client) DeletePod(ctx context.Context, user, pod string) error {
|
|
ns := model.NamespaceName(user)
|
|
podFullName := model.PodFullName(pod)
|
|
|
|
// Verify pod exists
|
|
_, err := c.Clientset.CoreV1().Pods(ns).Get(ctx, podFullName, metav1.GetOptions{})
|
|
if err != nil {
|
|
if k8serrors.IsNotFound(err) {
|
|
return ErrPodNotFound
|
|
}
|
|
return fmt.Errorf("get pod: %w", err)
|
|
}
|
|
|
|
// Delete pod
|
|
if err := c.Clientset.CoreV1().Pods(ns).Delete(ctx, podFullName, metav1.DeleteOptions{}); err != nil && !k8serrors.IsNotFound(err) {
|
|
return fmt.Errorf("delete pod: %w", err)
|
|
}
|
|
|
|
// Delete service
|
|
svcName := model.ServiceName(pod)
|
|
if err := c.Clientset.CoreV1().Services(ns).Delete(ctx, svcName, metav1.DeleteOptions{}); err != nil && !k8serrors.IsNotFound(err) {
|
|
return fmt.Errorf("delete service: %w", err)
|
|
}
|
|
|
|
// Delete ingress route and middlewares
|
|
ingressName := fmt.Sprintf("dev-pod-%s-ingress", pod)
|
|
if err := c.Dynamic.Resource(traefikIngressRouteGVR).Namespace(ns).Delete(ctx, ingressName, metav1.DeleteOptions{}); err != nil && !k8serrors.IsNotFound(err) {
|
|
return fmt.Errorf("delete ingress route: %w", err)
|
|
}
|
|
|
|
// Delete middlewares: basic-auth is shared, only delete pod-specific strip-prefix
|
|
stripName := fmt.Sprintf("strip-dev-%s-ralphex-prefix", pod)
|
|
if err := c.Dynamic.Resource(traefikMiddlewareGVR).Namespace(ns).Delete(ctx, stripName, metav1.DeleteOptions{}); err != nil && !k8serrors.IsNotFound(err) {
|
|
return fmt.Errorf("delete strip-prefix middleware: %w", err)
|
|
}
|
|
|
|
// Delete per-pod PVC
|
|
pvcName := PVCName(pod)
|
|
if err := c.Clientset.CoreV1().PersistentVolumeClaims(ns).Delete(ctx, pvcName, metav1.DeleteOptions{}); err != nil && !k8serrors.IsNotFound(err) {
|
|
return fmt.Errorf("delete per-pod pvc: %w", err)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// DeleteAllPods removes all pods for a user by deleting the entire namespace.
// All shared resources (secrets, configmaps, PVCs, network policy) go with it.
func (c *Client) DeleteAllPods(ctx context.Context, user string) error {
	return c.DeleteNamespace(ctx, user)
}
|
|
|
|
// ListPods returns all dev pods for a user with status and age.
|
|
func (c *Client) ListPods(ctx context.Context, user string) ([]model.Pod, error) {
|
|
ns := model.NamespaceName(user)
|
|
|
|
podList, err := c.Clientset.CoreV1().Pods(ns).List(ctx, metav1.ListOptions{
|
|
LabelSelector: "app=dev-pod",
|
|
})
|
|
if err != nil {
|
|
if k8serrors.IsNotFound(err) {
|
|
return nil, nil
|
|
}
|
|
return nil, fmt.Errorf("list pods: %w", err)
|
|
}
|
|
|
|
result := make([]model.Pod, 0, len(podList.Items))
|
|
for _, p := range podList.Items {
|
|
podName := p.Labels["podname"]
|
|
cpuReq, cpuLimit, memReq, memLimit := containerResources(p.Spec.Containers)
|
|
result = append(result, model.Pod{
|
|
User: user,
|
|
Name: podName,
|
|
Tools: extractEnvVar(p, "dev", "DEV_TOOLS"),
|
|
CPUReq: cpuReq,
|
|
CPULimit: cpuLimit,
|
|
MemReq: memReq,
|
|
MemLimit: memLimit,
|
|
Task: extractEnvVar(p, "dev", "TASK_DESCRIPTION"),
|
|
Status: string(p.Status.Phase),
|
|
Age: formatAge(p.CreationTimestamp.Time),
|
|
URL: model.PodURL(c.Config.Domain, user, podName),
|
|
CreatedAt: p.CreationTimestamp.Time,
|
|
Labels: p.Labels,
|
|
})
|
|
}
|
|
return result, nil
|
|
}
|
|
|
|
// GetPod returns detailed info about a single dev pod.
|
|
func (c *Client) GetPod(ctx context.Context, user, pod string) (*model.Pod, error) {
|
|
ns := model.NamespaceName(user)
|
|
podFullName := model.PodFullName(pod)
|
|
|
|
p, err := c.Clientset.CoreV1().Pods(ns).Get(ctx, podFullName, metav1.GetOptions{})
|
|
if err != nil {
|
|
if k8serrors.IsNotFound(err) {
|
|
return nil, ErrPodNotFound
|
|
}
|
|
return nil, fmt.Errorf("get pod: %w", err)
|
|
}
|
|
|
|
cpuReq, cpuLimit, memReq, memLimit := containerResources(p.Spec.Containers)
|
|
return &model.Pod{
|
|
User: user,
|
|
Name: pod,
|
|
Tools: extractEnvVar(*p, "dev", "DEV_TOOLS"),
|
|
CPUReq: cpuReq,
|
|
CPULimit: cpuLimit,
|
|
MemReq: memReq,
|
|
MemLimit: memLimit,
|
|
Task: extractEnvVar(*p, "dev", "TASK_DESCRIPTION"),
|
|
Status: string(p.Status.Phase),
|
|
Age: formatAge(p.CreationTimestamp.Time),
|
|
URL: model.PodURL(c.Config.Domain, user, pod),
|
|
CreatedAt: p.CreationTimestamp.Time,
|
|
Labels: p.Labels,
|
|
}, nil
|
|
}
|
|
|
|
// ensureSharedResources creates namespace-level resources idempotently.
|
|
func (c *Client) ensureSharedResources(ctx context.Context, user, vpnKey, forgejoToken, tailscaleKey string) error {
|
|
ns := model.NamespaceName(user)
|
|
|
|
// Secrets
|
|
devSec, aiSec := SecretsTemplate(user, vpnKey, c.Config.AnthropicKey, c.Config.OpenAIKey, forgejoToken, tailscaleKey)
|
|
if _, err := c.Clientset.CoreV1().Secrets(ns).Create(ctx, devSec, metav1.CreateOptions{}); err != nil {
|
|
if k8serrors.IsAlreadyExists(err) {
|
|
existing, getErr := c.Clientset.CoreV1().Secrets(ns).Get(ctx, devSec.Name, metav1.GetOptions{})
|
|
if getErr != nil {
|
|
return fmt.Errorf("get dev-secrets for update: %w", getErr)
|
|
}
|
|
devSec.ResourceVersion = existing.ResourceVersion
|
|
if _, err = c.Clientset.CoreV1().Secrets(ns).Update(ctx, devSec, metav1.UpdateOptions{}); err != nil {
|
|
return fmt.Errorf("update dev-secrets: %w", err)
|
|
}
|
|
} else {
|
|
return fmt.Errorf("create dev-secrets: %w", err)
|
|
}
|
|
}
|
|
if _, err := c.Clientset.CoreV1().Secrets(ns).Create(ctx, aiSec, metav1.CreateOptions{}); err != nil {
|
|
if k8serrors.IsAlreadyExists(err) {
|
|
existing, getErr := c.Clientset.CoreV1().Secrets(ns).Get(ctx, aiSec.Name, metav1.GetOptions{})
|
|
if getErr != nil {
|
|
return fmt.Errorf("get ai-proxy-secrets for update: %w", getErr)
|
|
}
|
|
aiSec.ResourceVersion = existing.ResourceVersion
|
|
if _, err = c.Clientset.CoreV1().Secrets(ns).Update(ctx, aiSec, metav1.UpdateOptions{}); err != nil {
|
|
return fmt.Errorf("update ai-proxy-secrets: %w", err)
|
|
}
|
|
} else {
|
|
return fmt.Errorf("create ai-proxy-secrets: %w", err)
|
|
}
|
|
}
|
|
|
|
// Basic auth secret (for Traefik basicAuth middleware)
|
|
basicAuthSec := BasicAuthSecretTemplate(user)
|
|
if _, err := c.Clientset.CoreV1().Secrets(ns).Create(ctx, basicAuthSec, metav1.CreateOptions{}); err != nil && !k8serrors.IsAlreadyExists(err) {
|
|
return fmt.Errorf("create basic-auth secret: %w", err)
|
|
}
|
|
|
|
// ConfigMap
|
|
cm := AIProxyConfigMapTemplate(user)
|
|
if _, err := c.Clientset.CoreV1().ConfigMaps(ns).Create(ctx, cm, metav1.CreateOptions{}); err != nil {
|
|
if k8serrors.IsAlreadyExists(err) {
|
|
existing, getErr := c.Clientset.CoreV1().ConfigMaps(ns).Get(ctx, cm.Name, metav1.GetOptions{})
|
|
if getErr != nil {
|
|
return fmt.Errorf("get ai-proxy-config for update: %w", getErr)
|
|
}
|
|
cm.ResourceVersion = existing.ResourceVersion
|
|
if _, err = c.Clientset.CoreV1().ConfigMaps(ns).Update(ctx, cm, metav1.UpdateOptions{}); err != nil {
|
|
return fmt.Errorf("update ai-proxy-config: %w", err)
|
|
}
|
|
} else {
|
|
return fmt.Errorf("create ai-proxy-config: %w", err)
|
|
}
|
|
}
|
|
|
|
// NetworkPolicy
|
|
np := NetworkPolicyTemplate(user)
|
|
if _, err := c.Clientset.NetworkingV1().NetworkPolicies(ns).Create(ctx, np, metav1.CreateOptions{}); err != nil && !k8serrors.IsAlreadyExists(err) {
|
|
return fmt.Errorf("create network policy: %w", err)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// traefikGVR returns the GroupVersionResource for a Traefik CRD kind.
|
|
func traefikGVR(kind string) (schema.GroupVersionResource, error) {
|
|
switch kind {
|
|
case "IngressRoute":
|
|
return traefikIngressRouteGVR, nil
|
|
case "Middleware":
|
|
return traefikMiddlewareGVR, nil
|
|
default:
|
|
return schema.GroupVersionResource{}, fmt.Errorf("unsupported traefik kind: %s", kind)
|
|
}
|
|
}
|
|
|
|
// containerResources safely extracts CPU/memory request/limit strings from the first container.
|
|
func containerResources(containers []corev1.Container) (cpuReq, cpuLimit, memReq, memLimit string) {
|
|
if len(containers) == 0 {
|
|
return "0", "0", "0", "0"
|
|
}
|
|
c := containers[0]
|
|
return c.Resources.Requests.Cpu().String(),
|
|
c.Resources.Limits.Cpu().String(),
|
|
formatMemory(c.Resources.Requests.Memory()),
|
|
formatMemory(c.Resources.Limits.Memory())
|
|
}
|
|
|
|
// extractEnvVar finds an environment variable value from a named container in a pod.
|
|
func extractEnvVar(pod corev1.Pod, containerName, envName string) string {
|
|
for _, c := range pod.Spec.Containers {
|
|
if c.Name == containerName {
|
|
for _, e := range c.Env {
|
|
if e.Name == envName {
|
|
return e.Value
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// formatAge returns a human-readable duration since the given time.
|
|
func formatAge(created time.Time) string {
|
|
if created.IsZero() {
|
|
return ""
|
|
}
|
|
d := time.Since(created)
|
|
switch {
|
|
case d < time.Minute:
|
|
return fmt.Sprintf("%ds", int(d.Seconds()))
|
|
case d < time.Hour:
|
|
return fmt.Sprintf("%dm", int(d.Minutes()))
|
|
case d < 24*time.Hour:
|
|
return fmt.Sprintf("%dh", int(d.Hours()))
|
|
default:
|
|
return fmt.Sprintf("%dd", int(d.Hours()/24))
|
|
}
|
|
}
|
|
|
|
// formatMemory returns a human-readable memory quantity.
|
|
func formatMemory(q *resource.Quantity) string {
|
|
if q == nil || q.IsZero() {
|
|
return "0"
|
|
}
|
|
return q.String()
|
|
}
|