mirror of
https://github.com/thinking-machines-lab/tinker.git
synced 2026-04-23 16:54:58 +00:00
Sync contents
This commit is contained in:
parent
3e4e4e3560
commit
951d660110
32 changed files with 3895 additions and 635 deletions
480
docs/api/trainingclient.md
Normal file
480
docs/api/trainingclient.md
Normal file
|
|
@ -0,0 +1,480 @@
|
|||
# `tinker.lib.public_interfaces.training_client`
|
||||
|
||||
TrainingClient for Tinker API.
|
||||
|
||||
## `TrainingClient` Objects
|
||||
|
||||
```python
|
||||
class TrainingClient(TelemetryProvider, QueueStateObserver)
|
||||
```
|
||||
|
||||
Client for training ML models with forward/backward passes and optimization.
|
||||
|
||||
The TrainingClient corresponds to a fine-tuned model that you can train and sample from.
|
||||
You typically get one by calling `service_client.create_lora_training_client()`.
|
||||
Key methods:
|
||||
- forward_backward() - compute gradients for training
|
||||
- optim_step() - update model parameters with the Adam optimizer
|
||||
- save_weights_and_get_sampling_client() - export trained model for inference
|
||||
|
||||
Args:
|
||||
holder: Internal client managing HTTP connections and async operations
|
||||
model_id: Unique identifier for the model to train. Required for training operations.
|
||||
|
||||
Example:
|
||||
```python
|
||||
training_client = service_client.create_lora_training_client(base_model="Qwen/Qwen2.5-7B")
|
||||
fwdbwd_future = training_client.forward_backward(training_data, "cross_entropy")
|
||||
optim_future = training_client.optim_step(types.AdamParams(learning_rate=1e-4))
|
||||
fwdbwd_result = fwdbwd_future.result() # Wait for gradients
|
||||
optim_result = optim_future.result() # Wait for parameter update
|
||||
sampling_client = training_client.save_weights_and_get_sampling_client("my-model")
|
||||
```
|
||||
|
||||
#### `forward`
|
||||
|
||||
```python
|
||||
@capture_exceptions(fatal=True)
|
||||
def forward(
|
||||
data: List[types.Datum],
|
||||
loss_fn: types.LossFnType,
|
||||
loss_fn_config: Dict[str, float] | None = None
|
||||
) -> APIFuture[types.ForwardBackwardOutput]
|
||||
```
|
||||
|
||||
Compute a forward pass without gradients.
|
||||
|
||||
Args:
|
||||
data: List of training data samples
|
||||
loss_fn: Loss function type (e.g., "cross_entropy")
|
||||
loss_fn_config: Optional configuration for the loss function
|
||||
|
||||
Returns:
|
||||
APIFuture containing the forward pass outputs and loss
|
||||
|
||||
Example:
|
||||
```python
|
||||
data = [types.Datum(
|
||||
model_input=types.ModelInput.from_ints(tokenizer.encode("Hello")),
|
||||
loss_fn_inputs={"target_tokens": types.ModelInput.from_ints(tokenizer.encode("world"))}
|
||||
)]
|
||||
future = training_client.forward(data, "cross_entropy")
|
||||
result = await future
|
||||
print(f"Loss: {result.loss}")
|
||||
```
|
||||
|
||||
#### `forward_async`
|
||||
|
||||
```python
|
||||
async def forward_async(
|
||||
data: List[types.Datum],
|
||||
loss_fn: types.LossFnType,
|
||||
loss_fn_config: Dict[str, float] | None = None
|
||||
) -> APIFuture[types.ForwardBackwardOutput]
|
||||
```
|
||||
|
||||
Async version of forward.
|
||||
|
||||
#### `forward_backward`
|
||||
|
||||
```python
|
||||
@capture_exceptions(fatal=True)
|
||||
def forward_backward(
|
||||
data: List[types.Datum],
|
||||
loss_fn: types.LossFnType,
|
||||
loss_fn_config: Dict[str, float] | None = None
|
||||
) -> APIFuture[types.ForwardBackwardOutput]
|
||||
```
|
||||
|
||||
Compute a forward pass and a backward pass to calculate gradients.
|
||||
|
||||
Args:
|
||||
data: List of training data samples
|
||||
loss_fn: Loss function type (e.g., "cross_entropy")
|
||||
loss_fn_config: Optional configuration for the loss function
|
||||
|
||||
Returns:
|
||||
APIFuture containing the forward/backward outputs, loss, and gradients
|
||||
|
||||
Example:
|
||||
```python
|
||||
data = [types.Datum(
|
||||
model_input=types.ModelInput.from_ints(tokenizer.encode("Hello")),
|
||||
loss_fn_inputs={"target_tokens": types.ModelInput.from_ints(tokenizer.encode("world"))}
|
||||
)]
|
||||
|
||||
# Compute gradients
|
||||
fwdbwd_future = training_client.forward_backward(data, "cross_entropy")
|
||||
|
||||
# Update parameters
|
||||
optim_future = training_client.optim_step(
|
||||
types.AdamParams(learning_rate=1e-4)
|
||||
)
|
||||
|
||||
fwdbwd_result = await fwdbwd_future
|
||||
print(f"Loss: {fwdbwd_result.loss}")
|
||||
```
|
||||
|
||||
#### `forward_backward_async`
|
||||
|
||||
```python
|
||||
async def forward_backward_async(
|
||||
data: List[types.Datum],
|
||||
loss_fn: types.LossFnType,
|
||||
loss_fn_config: Dict[str, float] | None = None
|
||||
) -> APIFuture[types.ForwardBackwardOutput]
|
||||
```
|
||||
|
||||
Async version of forward_backward.
|
||||
|
||||
#### `forward_backward_custom`
|
||||
|
||||
```python
|
||||
@sync_only
|
||||
@capture_exceptions(fatal=True)
|
||||
def forward_backward_custom(
|
||||
data: List[types.Datum],
|
||||
loss_fn: CustomLossFnV1) -> APIFuture[types.ForwardBackwardOutput]
|
||||
```
|
||||
|
||||
Compute forward/backward with a custom loss function.
|
||||
|
||||
Allows you to define custom loss functions that operate on log probabilities.
|
||||
The custom function receives the data and the logprobs and returns the loss together with a dict of metrics.
|
||||
|
||||
Args:
|
||||
data: List of training data samples
|
||||
loss_fn: Custom loss function that takes (data, logprobs) and returns (loss, metrics)
|
||||
|
||||
Returns:
|
||||
APIFuture containing the forward/backward outputs with custom loss
|
||||
|
||||
Example:
|
||||
```python
|
||||
def custom_loss(data, logprobs_list):
|
||||
# Custom loss computation
|
||||
loss = torch.mean(torch.stack([torch.mean(lp) for lp in logprobs_list]))
|
||||
metrics = {"custom_metric": loss.item()}
|
||||
return loss, metrics
|
||||
|
||||
future = training_client.forward_backward_custom(data, custom_loss)
|
||||
result = future.result()
|
||||
print(f"Custom loss: {result.loss}")
|
||||
print(f"Metrics: {result.metrics}")
|
||||
```
|
||||
|
||||
#### `forward_backward_custom_async`
|
||||
|
||||
```python
|
||||
@capture_exceptions(fatal=True)
|
||||
async def forward_backward_custom_async(
|
||||
data: List[types.Datum],
|
||||
loss_fn: CustomLossFnV1) -> APIFuture[types.ForwardBackwardOutput]
|
||||
```
|
||||
|
||||
Async version of forward_backward_custom.
|
||||
|
||||
#### `optim_step`
|
||||
|
||||
```python
|
||||
@capture_exceptions(fatal=True)
|
||||
def optim_step(
|
||||
adam_params: types.AdamParams) -> APIFuture[types.OptimStepResponse]
|
||||
```
|
||||
|
||||
Update model parameters using the Adam optimizer.
|
||||
|
||||
Args:
|
||||
adam_params: Adam optimizer parameters (learning_rate, betas, eps, weight_decay)
|
||||
|
||||
Returns:
|
||||
APIFuture containing the optimizer step response
|
||||
|
||||
Example:
|
||||
```python
|
||||
# First compute gradients
|
||||
fwdbwd_future = training_client.forward_backward(data, "cross_entropy")
|
||||
|
||||
# Then update parameters
|
||||
optim_future = training_client.optim_step(
|
||||
types.AdamParams(
|
||||
learning_rate=1e-4,
|
||||
weight_decay=0.01
|
||||
)
|
||||
)
|
||||
|
||||
# Wait for both to complete
|
||||
fwdbwd_result = await fwdbwd_future
|
||||
optim_result = await optim_future
|
||||
```
|
||||
|
||||
#### `optim_step_async`
|
||||
|
||||
```python
|
||||
async def optim_step_async(
|
||||
adam_params: types.AdamParams) -> APIFuture[types.OptimStepResponse]
|
||||
```
|
||||
|
||||
Async version of optim_step.
|
||||
|
||||
#### `save_state`
|
||||
|
||||
```python
|
||||
@capture_exceptions(fatal=True)
|
||||
def save_state(name: str) -> APIFuture[types.SaveWeightsResponse]
|
||||
```
|
||||
|
||||
Save model weights to persistent storage.
|
||||
|
||||
Args:
|
||||
name: Name for the saved checkpoint
|
||||
|
||||
Returns:
|
||||
APIFuture containing the save response with checkpoint path
|
||||
|
||||
Example:
|
||||
```python
|
||||
# Save after training
|
||||
save_future = training_client.save_state("checkpoint-001")
|
||||
result = await save_future
|
||||
print(f"Saved to: {result.path}")
|
||||
```
|
||||
|
||||
#### `save_state_async`
|
||||
|
||||
```python
|
||||
async def save_state_async(name: str) -> APIFuture[types.SaveWeightsResponse]
|
||||
```
|
||||
|
||||
Async version of save_state.
|
||||
|
||||
#### `load_state`
|
||||
|
||||
```python
|
||||
@capture_exceptions(fatal=True)
|
||||
def load_state(path: str) -> APIFuture[types.LoadWeightsResponse]
|
||||
```
|
||||
|
||||
Load model weights from a saved checkpoint.
|
||||
|
||||
Args:
|
||||
path: Tinker path to saved weights (e.g., "tinker://run-id/weights/checkpoint-001")
|
||||
|
||||
Returns:
|
||||
APIFuture containing the load response
|
||||
|
||||
Example:
|
||||
```python
|
||||
# Load checkpoint to continue training
|
||||
load_future = training_client.load_state("tinker://run-id/weights/checkpoint-001")
|
||||
await load_future
|
||||
# Continue training from loaded state
|
||||
```
|
||||
|
||||
#### `load_state_async`
|
||||
|
||||
```python
|
||||
async def load_state_async(path: str) -> APIFuture[types.LoadWeightsResponse]
|
||||
```
|
||||
|
||||
Async version of load_state.
|
||||
|
||||
#### `load_state_with_optimizer`
|
||||
|
||||
```python
|
||||
@capture_exceptions(fatal=True)
|
||||
def load_state_with_optimizer(
|
||||
path: str) -> APIFuture[types.LoadWeightsResponse]
|
||||
```
|
||||
|
||||
Load model weights and optimizer state from a checkpoint.
|
||||
|
||||
Args:
|
||||
path: Tinker path to saved weights (e.g., "tinker://run-id/weights/checkpoint-001")
|
||||
|
||||
Returns:
|
||||
APIFuture containing the load response
|
||||
|
||||
Example:
|
||||
```python
|
||||
# Resume training with optimizer state
|
||||
load_future = training_client.load_state_with_optimizer(
|
||||
"tinker://run-id/weights/checkpoint-001"
|
||||
)
|
||||
await load_future
|
||||
# Continue training with restored optimizer momentum
|
||||
```
|
||||
|
||||
#### `load_state_with_optimizer_async`
|
||||
|
||||
```python
|
||||
async def load_state_with_optimizer_async(
|
||||
path: str) -> APIFuture[types.LoadWeightsResponse]
|
||||
```
|
||||
|
||||
Async version of load_state_with_optimizer.
|
||||
|
||||
#### `save_weights_for_sampler`
|
||||
|
||||
```python
|
||||
@capture_exceptions(fatal=True)
|
||||
def save_weights_for_sampler(
|
||||
name: str) -> APIFuture[types.SaveWeightsForSamplerResponse]
|
||||
```
|
||||
|
||||
Save model weights for use with a SamplingClient.
|
||||
|
||||
Args:
|
||||
name: Name for the saved sampler weights
|
||||
|
||||
Returns:
|
||||
APIFuture containing the save response with sampler path
|
||||
|
||||
Example:
|
||||
```python
|
||||
# Save weights for inference
|
||||
save_future = training_client.save_weights_for_sampler("sampler-001")
|
||||
result = await save_future
|
||||
print(f"Sampler weights saved to: {result.path}")
|
||||
|
||||
# Use the path to create a sampling client
|
||||
sampling_client = service_client.create_sampling_client(
|
||||
model_path=result.path
|
||||
)
|
||||
```
|
||||
|
||||
#### `save_weights_for_sampler_async`
|
||||
|
||||
```python
|
||||
async def save_weights_for_sampler_async(
|
||||
name: str) -> APIFuture[types.SaveWeightsForSamplerResponse]
|
||||
```
|
||||
|
||||
Async version of save_weights_for_sampler.
|
||||
|
||||
#### `get_info`
|
||||
|
||||
```python
|
||||
@sync_only
|
||||
@capture_exceptions(fatal=True)
|
||||
def get_info() -> types.GetInfoResponse
|
||||
```
|
||||
|
||||
Get information about the current model.
|
||||
|
||||
Returns:
|
||||
GetInfoResponse with model configuration and metadata
|
||||
|
||||
Example:
|
||||
```python
|
||||
info = training_client.get_info()
|
||||
print(f"Model ID: {info.model_data.model_id}")
|
||||
print(f"Base model: {info.model_data.model_name}")
|
||||
print(f"LoRA rank: {info.model_data.lora_rank}")
|
||||
```
|
||||
|
||||
#### `get_info_async`
|
||||
|
||||
```python
|
||||
@capture_exceptions(fatal=True)
|
||||
async def get_info_async() -> types.GetInfoResponse
|
||||
```
|
||||
|
||||
Async version of get_info.
|
||||
|
||||
#### `get_tokenizer`
|
||||
|
||||
```python
|
||||
@cache
|
||||
@capture_exceptions(fatal=True)
|
||||
def get_tokenizer() -> PreTrainedTokenizer
|
||||
```
|
||||
|
||||
Get the tokenizer for the current model.
|
||||
|
||||
Returns:
|
||||
PreTrainedTokenizer compatible with the model
|
||||
|
||||
Example:
|
||||
```python
|
||||
tokenizer = training_client.get_tokenizer()
|
||||
tokens = tokenizer.encode("Hello world")
|
||||
text = tokenizer.decode(tokens)
|
||||
```
|
||||
|
||||
#### `create_sampling_client`
|
||||
|
||||
```python
|
||||
@capture_exceptions(fatal=True)
|
||||
def create_sampling_client(
|
||||
model_path: str,
|
||||
retry_config: RetryConfig | None = None) -> SamplingClient
|
||||
```
|
||||
|
||||
Create a SamplingClient from saved weights.
|
||||
|
||||
Args:
|
||||
model_path: Tinker path to saved weights
|
||||
retry_config: Optional configuration for retrying failed requests
|
||||
|
||||
Returns:
|
||||
SamplingClient configured with the specified weights
|
||||
|
||||
Example:
|
||||
```python
|
||||
sampling_client = training_client.create_sampling_client(
|
||||
"tinker://run-id/weights/checkpoint-001"
|
||||
)
|
||||
# Use sampling_client for inference
|
||||
```
|
||||
|
||||
#### `create_sampling_client_async`
|
||||
|
||||
```python
|
||||
@capture_exceptions(fatal=True)
|
||||
async def create_sampling_client_async(
|
||||
model_path: str,
|
||||
retry_config: RetryConfig | None = None) -> SamplingClient
|
||||
```
|
||||
|
||||
Async version of create_sampling_client.
|
||||
|
||||
#### `save_weights_and_get_sampling_client`
|
||||
|
||||
```python
|
||||
@capture_exceptions(fatal=True)
|
||||
def save_weights_and_get_sampling_client(
|
||||
name: str | None = None,
|
||||
retry_config: RetryConfig | None = None) -> SamplingClient
|
||||
```
|
||||
|
||||
Save current weights and create a SamplingClient for inference.
|
||||
|
||||
Args:
|
||||
name: Optional name for the saved weights (currently ignored for ephemeral saves)
|
||||
retry_config: Optional configuration for retrying failed requests
|
||||
|
||||
Returns:
|
||||
SamplingClient configured with the current model weights
|
||||
|
||||
Example:
|
||||
```python
|
||||
# After training, create a sampling client directly
|
||||
sampling_client = training_client.save_weights_and_get_sampling_client()
|
||||
|
||||
# Now use it for inference
|
||||
prompt = types.ModelInput.from_ints(tokenizer.encode("Hello"))
|
||||
params = types.SamplingParams(max_tokens=20)
|
||||
result = sampling_client.sample(prompt, 1, params).result()
|
||||
```
|
||||
|
||||
#### `save_weights_and_get_sampling_client_async`
|
||||
|
||||
```python
|
||||
@capture_exceptions(fatal=True)
|
||||
async def save_weights_and_get_sampling_client_async(
|
||||
name: str | None = None,
|
||||
retry_config: RetryConfig | None = None) -> SamplingClient
|
||||
```
|
||||
|
||||
Async version of save_weights_and_get_sampling_client.
|
||||
Loading…
Add table
Add a link
Reference in a new issue