"""Realistic AI-startup task titles and descriptions, keyed by domain.
|
|
|
|
Each domain has a pool of (title, description) tuples. The generator picks
|
|
from these deterministically using the seeded RNG, cycling if the pool is
|
|
exhausted.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
from ..db.models.company import Domain
|
|
|
|
TASK_POOL: dict[Domain, list[tuple[str, str]]] = {
|
|
    Domain.SYSTEM: [
        (
            "Set Up GPU-Aware K8s Cluster with Auto-Scaling",
            "Deploy a Kubernetes cluster with NVIDIA GPU operator, node auto-scaling based on inference queue depth, and spot instance fallback for training workloads.",
        ),
        (
            "Build CI/CD Pipeline for ML Model Registry",
            "Create a CI pipeline that runs training validation, pushes versioned model artifacts to a registry, and auto-deploys to a staging inference endpoint.",
        ),
        (
            "Implement Blue-Green Deployment for LLM Serving",
            "Set up zero-downtime model swaps for a vLLM serving cluster with automated rollback triggered by latency and error-rate thresholds.",
        ),
        (
            "Deploy Observability Stack for AI Workloads",
            "Stand up Grafana, Prometheus, and OpenTelemetry with custom dashboards tracking GPU utilization, token throughput, time-to-first-token, and per-request cost.",
        ),
        (
            "Terraform Multi-Region Inference Infrastructure",
            "Write IaC modules to provision inference endpoints across 3+ regions with global load balancing, failover routing, and centralized logging.",
        ),
        (
            "Container Image Optimization for ML Serving",
            "Reduce Docker image sizes for PyTorch/CUDA serving containers from 15 GB to under 4 GB using multi-stage builds and distroless bases to cut cold-start times.",
        ),
        (
            "Implement Secret Rotation and API Key Management",
            "Build an automated secret rotation system for API keys, database credentials, and model provider tokens across staging and production environments.",
        ),
        (
            "Set Up Cost Monitoring and GPU Budget Alerts",
            "Integrate cloud billing APIs with a dashboard showing per-team GPU spend, cost-per-inference breakdowns, and automated alerts when daily spend exceeds thresholds.",
        ),
        (
            "Build Canary Release Pipeline for Embedding Models",
            "Implement a canary deployment system that gradually shifts traffic to new embedding model versions, comparing retrieval quality metrics in real time.",
        ),
        (
            "Migrate Inference Workloads to Serverless GPU",
            "Evaluate and migrate bursty inference workloads to serverless GPU providers, benchmarking cold-start latency against always-on instances.",
        ),
        (
            "Implement Disaster Recovery for Training Checkpoints",
            "Design a cross-region checkpoint backup system with automated integrity verification, ensuring training runs can resume within 15 minutes of any single-region failure.",
        ),
        (
            "Build Internal Developer Platform for ML Engineers",
            "Create a self-service portal where ML engineers can request GPU instances, spin up Jupyter environments, and launch training jobs without touching infrastructure.",
        ),
    ],
    Domain.RESEARCH: [
        (
            "Design Benchmark for Legal Document QA",
            "Create a benchmark suite of 2,000+ annotated legal questions across contract law and compliance, with human-expert baselines and an automated evaluation harness.",
        ),
        (
            "Investigate MoE Routing for Multilingual Models",
            "Research and prototype alternative Mixture-of-Experts routing strategies that improve expert utilization for low-resource languages without degrading high-resource performance.",
        ),
        (
            "Reproduce and Extend Speculative Decoding Results",
            "Replicate speculative decoding paper results on Llama-3 class models, then test novel draft model architectures that improve acceptance rates on code generation.",
        ),
        (
            "Develop RAG Hallucination Detection Framework",
            "Build a systematic evaluation pipeline measuring faithfulness, relevance, and attribution accuracy for retrieval-augmented generation systems.",
        ),
        (
            "Prototype LoRA Merging for Multi-Tenant Serving",
            "Research methods for dynamically composing multiple LoRA adapters at inference time, measuring quality degradation versus serving separate fine-tuned models.",
        ),
        (
            "Benchmark Long-Context Retrieval Across 128K Models",
            "Systematically evaluate needle-in-a-haystack and multi-hop reasoning performance across frontier models at various context lengths with reproducible results.",
        ),
        (
            "Investigate Synthetic Data Quality for Code Generation",
            "Develop automated quality scoring methods for synthetically generated code training data, correlating filter thresholds with downstream model performance.",
        ),
        (
            "Research KV-Cache Compression Techniques",
            "Prototype and benchmark KV-cache eviction and quantization strategies for long-running conversational agents under fixed memory budgets.",
        ),
        (
            "Build Ablation Study Framework for Prompt Engineering",
            "Create an experimentation harness for testing prompt variations across multiple models and tasks with statistical significance testing and cost tracking.",
        ),
        (
            "Explore Constitutional AI for Domain-Specific Safety",
            "Adapt constitutional AI methods to create a self-improving safety filter for a healthcare chatbot, defining domain-specific principles and measuring accuracy.",
        ),
        (
            "Develop Novel Chunking Strategies for Technical RAG",
            "Research and benchmark alternative document chunking methods—semantic, AST-aware, sliding window—specifically for API documentation and code repositories.",
        ),
        (
            "Prototype Test-Time Compute Scaling for Math Reasoning",
            "Implement best-of-N sampling, tree search, and self-verification approaches for math reasoning, measuring the compute-accuracy Pareto frontier.",
        ),
    ],
    Domain.DATA: [
        (
            "Build Web Scraping Pipeline for Industry News Corpus",
            "Design a pipeline that crawls 50+ AI/tech news sources daily, deduplicates articles, extracts structured metadata, and loads clean text into a vector store.",
        ),
        (
            "Create Annotation Platform for Dialogue Quality",
            "Build an annotation workflow where human raters score LLM conversation logs on helpfulness, accuracy, and safety, with inter-rater agreement tracking.",
        ),
        (
            "Implement PII Detection and Redaction Pipeline",
            "Deploy a pipeline to detect and redact personally identifiable information from training data, with audit logging and configurable redaction strategies.",
        ),
        (
            "Curate Instruction-Tuning Dataset from Internal Docs",
            "Extract, clean, and convert 10,000+ pages of internal documentation into high-quality instruction-response pairs suitable for fine-tuning.",
        ),
        (
            "Build Data Quality Monitoring for Feature Store",
            "Implement data validation checks on streaming feature pipelines, alerting on schema drift, null-rate spikes, and distribution shifts before they affect models.",
        ),
        (
            "Design ETL Pipeline for Multi-Modal Training Data",
            "Build a DAG pipeline that ingests images, PDFs, and structured data, applies OCR and layout detection, and produces unified records for vision-language training.",
        ),
        (
            "Implement Deduplication for Large Text Corpora",
            "Deploy MinHash LSH-based near-deduplication at scale for 100M+ documents with configurable similarity thresholds and a review UI for borderline cases.",
        ),
        (
            "Build Synthetic Data Pipeline for Rare Edge Cases",
            "Create a system that uses frontier LLMs to generate realistic synthetic examples for underrepresented categories in a classification dataset.",
        ),
        (
            "Create Data Versioning and Lineage Tracking System",
            "Set up data versioning integrated with the ML training pipeline so every model checkpoint can be traced back to the exact dataset snapshot used.",
        ),
        (
            "Build Customer Feedback Loop into Training Pipeline",
            "Implement a system where end-user thumbs-up/down signals are routed, reviewed, and selectively incorporated into fine-tuning datasets with human approval.",
        ),
        (
            "Migrate Legacy Warehouse to ML-Ready Lakehouse",
            "Transform and migrate 5 years of product analytics data from a legacy SQL warehouse into a Parquet-based lakehouse optimized for feature engineering.",
        ),
    ],
    Domain.FRONTEND: [
        (
            "Build Interactive LLM Playground with Streaming",
            "Create a web app where users test multiple LLM providers side-by-side with streaming output, adjustable parameters, and conversation history persistence.",
        ),
        (
            "Design Admin Dashboard for AI Agent Monitoring",
            "Build a dashboard showing real-time agent execution traces, tool call sequences, token usage graphs, and cost breakdowns with drill-down filtering.",
        ),
        (
            "Create Document Chat Interface for RAG Product",
            "Implement a drag-and-drop document upload UI with a conversational interface showing source citations, confidence indicators, and reference highlighting.",
        ),
        (
            "Build Annotation Review and Approval Interface",
            "Design a UI for data team leads to review annotator work, resolve disagreements, view agreement stats, and approve batches for training inclusion.",
        ),
        (
            "Implement Prompt Management Studio",
            "Build a collaborative app where teams version, test, and A/B deploy prompt templates with visual diffs, rollback, and per-version performance analytics.",
        ),
        (
            "Create Customer-Facing AI Usage Analytics Dashboard",
            "Build an embeddable dashboard showing API call volumes, latency percentiles, token consumption, and cost trends for enterprise customers.",
        ),
        (
            "Build Visual Pipeline Editor for No-Code AI Workflows",
            "Create a node-based drag-and-drop editor where non-technical users chain data sources, LLM calls, and output actions into automated AI workflows.",
        ),
        (
            "Design Chat Widget for Website Embedding",
            "Build a lightweight, brandable chat widget under 50 KB that customers embed on their sites, with streaming responses and escalation-to-human capability.",
        ),
        (
            "Build Model Comparison Results Viewer",
            "Create a web interface displaying benchmark results across models in interactive tables and charts with filtering by task type and model size.",
        ),
        (
            "Implement Real-Time Collaboration for AI Writing Tool",
            "Add multiplayer editing to an AI writing tool using CRDTs, with per-user cursors, AI suggestion tracking, and version history.",
        ),
        (
            "Create Enterprise RAG Onboarding Wizard",
            "Build a step-by-step setup wizard guiding enterprise customers through connecting data sources, configuring chunking, testing retrieval, and deploying their endpoint.",
        ),
    ],
    Domain.BACKEND: [
        (
            "Build Multi-Tenant LLM Gateway with Rate Limiting",
            "Implement an API gateway that proxies requests to multiple LLM providers, enforces per-tenant rate limits, tracks usage, and handles automatic failover.",
        ),
        (
            "Implement OAuth2 + SAML SSO for Enterprise Platform",
            "Add enterprise authentication supporting SAML 2.0, OIDC, and SCIM provisioning for customers integrating with their identity provider.",
        ),
        (
            "Design Webhook System for Async AI Job Completion",
            "Build a reliable webhook delivery system with exponential backoff, signature verification, dead letter queue, and a webhook management API.",
        ),
        (
            "Create Unified Embedding API with Caching Layer",
            "Build a microservice abstracting over multiple embedding providers with a Redis-backed cache, batch processing, and automatic model version migration.",
        ),
        (
            "Build Conversation Memory Service for Multi-Session Agents",
            "Implement a service that stores, summarizes, and retrieves conversation history across sessions using structured storage and semantic vector search.",
        ),
        (
            "Implement Usage-Based Billing with Stripe Integration",
            "Build a metering system that tracks token consumption per customer, aggregates monthly invoices, and syncs with Stripe for automated usage-based charging.",
        ),
        (
            "Create Plugin Marketplace Backend",
            "Design the API and data model for a marketplace where third-party developers register, version, and distribute plugins for the AI platform.",
        ),
        (
            "Build RAG Ingestion Service with Chunking and Indexing",
            "Implement an async document processing service that accepts PDFs, DOCX, and HTML, chunks them, generates embeddings, and upserts into a vector store.",
        ),
        (
            "Implement Audit Logging and Compliance API",
            "Build a tamper-evident audit log system recording all AI interactions and admin actions, with an API for compliance queries and SOC 2 / HIPAA exports.",
        ),
        (
            "Design Multi-Model Routing and Fallback Service",
            "Create a smart routing layer directing requests to the optimal model based on task complexity, latency requirements, and cost, with provider failover.",
        ),
        (
            "Build File Processing Service for Vision-Language Models",
            "Implement an async service that accepts images and documents, runs them through vision-language models for extraction, and returns structured JSON output.",
        ),
        (
            "Implement Streaming API with Server-Sent Events",
            "Build an SSE-based streaming endpoint for LLM responses with connection resumption, partial response caching, and graceful degradation.",
        ),
    ],
    Domain.TRAINING: [
        (
            "Fine-Tune Llama-3 8B for Domain-Specific Support",
            "Run supervised fine-tuning on 50K curated customer support conversations using QLoRA, targeting 15% accuracy improvement over the base model.",
        ),
        (
            "Implement RLHF Pipeline for Code Generation Model",
            "Build an end-to-end RLHF pipeline with a reward model trained on human preference data and PPO training loop evaluated against HumanEval.",
        ),
        (
            "Distill GPT-4 Class Model into Efficient 3B Model",
            "Use knowledge distillation with synthetic data to create a compact model retaining 90%+ teacher performance on targeted tasks at 10x lower inference cost.",
        ),
        (
            "Train Custom Embedding Model for Vertical Search",
            "Fine-tune a sentence-transformers model on domain-specific query-document pairs with contrastive learning, hard negative mining, and retrieval benchmarks.",
        ),
        (
            "Build Hyperparameter Search for Fine-Tuning Jobs",
            "Implement an Optuna-based HPO system searching over learning rate, LoRA rank, batch size, and data mixing ratios with early stopping.",
        ),
        (
            "Run Continued Pre-Training on Proprietary Corpus",
            "Execute continued pre-training of a 7B base model on 10B tokens of domain-specific text with careful learning rate scheduling to avoid catastrophic forgetting.",
        ),
        (
            "Train Reward Model from Preference Annotations",
            "Collect and process 20K pairwise preference annotations, train a Bradley-Terry reward model, and validate calibration against held-out human judgments.",
        ),
        (
            "Build Multi-GPU Training Infra with DeepSpeed",
            "Set up distributed training using DeepSpeed ZeRO Stage 3 across an 8-node GPU cluster with checkpoint sharding and fault-tolerant resumption.",
        ),
        (
            "Implement DPO Fine-Tuning Pipeline",
            "Build a Direct Preference Optimization pipeline as a simpler RLHF alternative, comparing quality and training stability on the same preference dataset.",
        ),
        (
            "Train Vision-Language Adapter for Document Understanding",
            "Fine-tune a LoRA adapter on a VLM for extracting structured data from invoices, receipts, and forms with 95%+ field-level accuracy.",
        ),
        (
            "Build Eval-Driven Training Loop with Auto Checkpointing",
            "Implement a training harness that runs benchmarks every N steps, auto-saves the best checkpoint, detects instability, and alerts on loss spikes.",
        ),
        (
            "Fine-Tune Whisper for Industry-Specific Transcription",
            "Adapt Whisper-large for medical dictation using 500 hours of labeled audio, targeting 30% WER reduction on domain-specific terminology.",
        ),
    ],
    Domain.HARDWARE: [
        (
            "Optimize LLM Inference Latency with TensorRT-LLM",
            "Convert a 70B model to TensorRT-LLM with INT8/FP8 quantization, continuous batching, and paged attention, targeting sub-200ms time-to-first-token.",
        ),
        (
            "Deploy On-Device ML Model for Mobile Classification",
            "Convert a PyTorch vision model to Core ML and TFLite, optimize with quantization-aware training, and benchmark on iPhone and Pixel hardware.",
        ),
        (
            "Build GPU Cluster Scheduling with Fair-Share Queuing",
            "Implement a scheduler for a shared GPU cluster enforcing per-team quotas, priority queuing, preemption policies, and utilization-based chargeback.",
        ),
        (
            "Implement Quantization Pipeline (GPTQ/AWQ/GGUF)",
            "Build an automated pipeline that takes any model, produces GPTQ, AWQ, and GGUF quantized variants, runs quality regression, and publishes passing models.",
        ),
        (
            "Deploy Edge Inference for Real-Time Video Analytics",
            "Set up an NVIDIA Jetson-based inference node running YOLO and a lightweight LLM for on-premises real-time camera analysis with local data processing.",
        ),
        (
            "Optimize vLLM Serving for Production Workload",
            "Profile and tune vLLM parameters—max batch size, KV cache, swap space, tensor parallelism—for target throughput at P99 latency SLA.",
        ),
        (
            "Build Multi-GPU Inference with Tensor Parallelism",
            "Configure and benchmark a 70B+ model serving across 4-8 GPUs with tensor and pipeline parallelism, optimizing throughput versus latency tradeoffs.",
        ),
        (
            "Implement Dynamic Batching for Inference Requests",
            "Build a request batching layer that groups incoming requests by sequence length and priority, maximizing GPU utilization within per-request latency SLAs.",
        ),
        (
            "Design Hybrid CPU/GPU Inference Architecture",
            "Architect a system routing lightweight requests to CPU inference and complex requests to GPU instances, reducing overall compute cost by 40%.",
        ),
        (
            "Set Up Triton Inference Server for Multi-Model Serving",
            "Deploy NVIDIA Triton to serve embedding, reranking, and generation models on shared GPU infrastructure with dynamic batching and concurrency control.",
        ),
        (
            "Build GPU Health Monitoring and Failover System",
            "Implement a daemon detecting GPU memory errors, thermal throttling, and NVLink degradation, automatically draining affected nodes and redistributing workloads.",
        ),
        (
            "Benchmark Specialized AI Accelerators vs H100",
            "Evaluate Groq, Cerebras, and custom ASICs against H100 GPUs, producing a cost-per-token and latency comparison with a migration recommendation.",
        ),
        (
            "Implement Speculative Decoding in Production Stack",
            "Integrate speculative decoding with a small draft model into the existing serving infrastructure, measuring real-world throughput improvement.",
        ),
    ],
}

def pick_task_text(rng, domain: Domain) -> tuple[str, str]:
    """Deterministically pick a (title, description) for *domain* using *rng*."""
    pool = TASK_POOL[domain]
    idx = rng.randint(0, len(pool) - 1)
    return pool[idx]
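

# Minimal usage sketch (illustrative only, not part of the generator): it assumes
# the caller seeds a standard-library ``random.Random`` so selections are
# reproducible, matching the deterministic picking described in the module
# docstring. The seed value and the choice of Domain.SYSTEM are arbitrary.
if __name__ == "__main__":
    import random

    rng = random.Random(42)  # hypothetical seed, chosen only for this demo
    title, description = pick_task_text(rng, Domain.SYSTEM)
    print(title)
    print(description)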