Convert FOB submodule to regular folder

This commit is contained in:
arihanv 2025-05-18 16:36:28 -07:00
parent 94f046ad40
commit 94825011a0
74 changed files with 4563 additions and 0 deletions

View file

@ -0,0 +1 @@
from pytorch_fob.engine import Engine

View file

@ -0,0 +1,22 @@
import argparse
from pathlib import Path
from pytorch_fob.engine.engine import Engine
def get_parser():
    """Build the command-line parser for the data-preparation entry point."""
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "experiment_file",
        type=Path,
        help="The yaml file specifying the experiment.",
    )
    return cli
def main(args: argparse.Namespace, extra_args: list[str]):
    """Parse the experiment file and download/prepare all task data."""
    engine = Engine()
    engine.parse_experiment_from_file(args.experiment_file, extra_args=extra_args)
    # only data setup here — training is started elsewhere
    engine.prepare_data()
if __name__ == '__main__':
    # unknown CLI flags are forwarded to the engine as config overrides
    known, unknown = get_parser().parse_known_args()
    main(known, unknown)

View file

@ -0,0 +1,7 @@
from pathlib import Path
from pytorch_fob.engine.engine import Engine
def repository_root() -> Path:
    """Absolute path of the repository root (two levels above this file)."""
    here = Path(__file__).resolve()
    return here.parent.parent

View file

@ -0,0 +1,272 @@
import math
import time
from typing import Iterable, Optional
import deepspeed
import torch
from lightning import Callback, LightningModule, Trainer
from lightning_utilities.core.rank_zero import rank_zero_only
from torch.linalg import vector_norm
from pytorch_fob.engine.utils import log_debug, log_info, log_warn, seconds_to_str
class RestrictTrainEpochs(Callback):
    """Counts epochs since the start of training and requests a stop once
    `max_epochs` have elapsed (epochs resumed from a checkpoint are not re-counted)."""

    def __init__(self, max_epochs: int):
        super().__init__()
        self.max_epochs = max_epochs
        self.epochs = 0
        self.skip_first = False

    def on_train_start(self, trainer: Trainer, pl_module: LightningModule):
        log_debug(f"Training for {self.max_epochs} epochs...")
        self.epochs = 0
        trainer.should_stop = False

    def on_train_epoch_end(self, trainer: Trainer, pl_module: LightningModule):
        if not self.skip_first:
            self.epochs += 1
            log_debug(f"Epoch {self.epochs}/{self.max_epochs}")
            # TODO: test for DDP, do we need 'trainer.strategy.reduce_boolean_decision'?
            if self.epochs >= self.max_epochs:
                log_debug(f"Stopping training after {self.epochs} epochs")
                trainer.should_stop = True
        else:
            self.skip_first = False

    def on_load_checkpoint(self, trainer: Trainer, pl_module: LightningModule, checkpoint):
        # checkpoint loads the model at the end of the epoch, so we do not count the first epoch
        self.skip_first = True
class OptimizerTime(Callback):
    """Tracks a running mean (over epochs) of the per-step optimizer time in ms.

    Expects `pl_module.optimizer_times_ms` to be a list of per-step timings,
    which is logged, folded into the running mean, and cleared every epoch.
    """

    def __init__(self):
        super().__init__()
        self.total_mean_optimizer_step_time_ms: float = 0.0
        self.total_epochs: int = 0

    def on_train_epoch_end(self, trainer, pl_module):
        times = pl_module.optimizer_times_ms
        if not times:
            return
        epoch_mean = sum(times) / len(times)
        pl_module.log("mean_optimizer_step_time_ms", epoch_mean, on_step=False, on_epoch=True, sync_dist=True)
        # fold this epoch's mean into the running mean over all epochs
        self.total_epochs += 1
        previous_sum = self.total_mean_optimizer_step_time_ms * (self.total_epochs - 1)
        self.total_mean_optimizer_step_time_ms = (previous_sum + epoch_mean) / self.total_epochs
        # start collecting fresh timings for the next epoch
        pl_module.optimizer_times_ms = []  # type: ignore

    def state_dict(self) -> dict[str, float | int]:
        return {"running_mean": self.total_mean_optimizer_step_time_ms, "total_epochs": self.total_epochs}

    def load_state_dict(self, state_dict: dict[str, float | int]):
        self.total_mean_optimizer_step_time_ms = state_dict["running_mean"]
        self.total_epochs = state_dict["total_epochs"]  # type: ignore
class PrintEpochWithTime(Callback):
    """Logs wall-clock time per epoch, split into training and validation parts."""

    def __init__(self, active: bool = True):
        super().__init__()
        self.active: bool = active
        self.time: dict[str, Optional[float]]
        self.reset_time()

    def reset_time(self):
        self.time = {"train_start": None, "val_start": None, "val_end": None}

    @rank_zero_only
    def on_train_epoch_start(self, trainer: Trainer, pl_module: LightningModule):
        if self.active:
            self.time["train_start"] = time.time()

    @rank_zero_only
    def on_train_epoch_end(self, trainer: Trainer, pl_module: LightningModule):
        # need to print here since train epoch ends after validation is done
        if not self.active:
            return
        if any(v is None for v in self.time.values()):
            return
        max_epochs = pl_module.config.max_epochs
        train_time = math.ceil(time.time() - self.time["train_start"])  # type: ignore
        val_time = math.ceil(self.time["val_end"] - self.time["val_start"])  # type: ignore
        log_info(
            f"Finished training epoch {trainer.current_epoch + 1} of {max_epochs}. Time spent: training: {seconds_to_str(train_time - val_time)}, validation: {seconds_to_str(val_time)}, total: {seconds_to_str(train_time)}."
        )
        self.reset_time()

    @rank_zero_only
    def on_validation_epoch_start(self, trainer: Trainer, pl_module: LightningModule):
        if self.active:
            self.time["val_start"] = time.time()

    @rank_zero_only
    def on_validation_epoch_end(self, trainer: Trainer, pl_module: LightningModule):
        if self.active:
            self.time["val_end"] = time.time()
def metric_fn(metric: str, v: torch.Tensor, override: Optional[float] = None) -> float:
    """Reduce tensor `v` to a scalar according to `metric`.

    If `override` is given, it is returned unchanged (used to mark invalid values).
    Raises ValueError for an unknown metric name.
    """
    if override is not None:
        return override
    reducers = {
        "mean": lambda t: t.mean().item(),
        "sum": lambda t: t.sum().item(),
        "abs_mean": lambda t: t.abs().mean().item(),
        "std": lambda t: t.std().item(),
        "abs_std": lambda t: t.abs().std().item(),
        "min": lambda t: t.min().item(),
        "max": lambda t: t.max().item(),
        "l1": lambda t: vector_norm(t, ord=1).item(),
        "l2": lambda t: vector_norm(t, ord=2).item(),
        "sq_mean": lambda t: (t**2).mean().item(),
        "sq_sum": lambda t: (t**2).sum().item(),
    }
    reducer = reducers.get(metric)
    if reducer is None:
        raise ValueError(f"unknown metric {metric}")
    return reducer(v)


def add_metrics_to_stats(
    stats: dict[str, float],
    prefix: str,
    name: str,
    v: torch.Tensor,
    metrics: Iterable[str],
    override: Optional[float] = None,
):
    """Store each requested metric of `v` in `stats` under '{prefix}/{name}/{metric}'."""
    stats.update(
        {f"{prefix}/{name}/{m}": metric_fn(m, v, override=override) for m in metrics}
    )
class LogTrainingStats(Callback):
    """Periodically logs statistics of parameters, gradients, optimizer momenta and
    effective learning rates via the trainer's loggers.

    Logging happens in `on_before_optimizer_step` (rank zero only) every
    `log_every_n_steps` steps. The interval can optionally be multiplied by
    `log_interval_factor` every `change_log_interval_every_n_steps` steps,
    clamped to [`min_log_interval`, `max_log_interval`].
    """
    def __init__(
        self,
        log_gradient: bool = True,
        log_params: bool = True,
        log_quantiles: bool = False,
        log_momentum: bool = False,
        log_lrs: bool = True,
        log_every_n_steps: int = 50,
        change_log_interval_every_n_steps: Optional[int] = None,
        log_interval_factor: float = 2.0,
        min_log_interval: int = 1,
        max_log_interval: Optional[int] = None,
        metrics: Iterable[str] = ("mean", "abs_mean", "std", "abs_std", "min", "max", "l1", "l2", "sq_mean"),
    ):
        """
        Args:
            log_gradient: log statistics of parameter gradients.
            log_params: log statistics of parameter values.
            log_quantiles: additionally log quartiles (for tensors below 1e7 elements).
            log_momentum: log first/second order optimizer momenta if present in state.
            log_lrs: log per-parameter lr and effective lr (grad norm / param norm).
            log_every_n_steps: initial logging interval in optimizer steps.
            change_log_interval_every_n_steps: if set, grow the interval periodically.
            log_interval_factor: multiplier applied when the interval grows.
            min_log_interval / max_log_interval: clamp bounds for the interval.
            metrics: names understood by `metric_fn`.
        """
        super().__init__()
        self.log_gradient = log_gradient
        self.log_params = log_params
        self.log_quantiles = log_quantiles
        self.log_momentum = log_momentum
        self.log_lrs = log_lrs
        self.log_every_n_steps = log_every_n_steps
        self.change_log_interval_every_n_steps = change_log_interval_every_n_steps
        self.log_interval_factor = log_interval_factor
        self.min_log_interval = min_log_interval
        self.max_log_interval = max_log_interval
        self.metrics = metrics

    def _check_and_adjust_log_interval(self, trainer: Trainer, pl_module: LightningModule) -> bool:
        """Grow/clamp the logging interval if configured; return True when this step should log."""
        if self.change_log_interval_every_n_steps is not None:
            if trainer.global_step > 0 and trainer.global_step % self.change_log_interval_every_n_steps == 0:
                self.log_every_n_steps = math.ceil(self.log_every_n_steps * self.log_interval_factor)
            # clamp the interval into [min_log_interval, max_log_interval]
            self.log_every_n_steps = max(self.log_every_n_steps, self.min_log_interval)
            if self.max_log_interval is not None:
                self.log_every_n_steps = min(self.log_every_n_steps, self.max_log_interval)
            pl_module.log("logging_interval", self.log_every_n_steps)
        return trainer.global_step % self.log_every_n_steps == 0

    @rank_zero_only
    def on_before_optimizer_step(self, trainer: Trainer, pl_module: LightningModule, optimizer: torch.optim.Optimizer):
        """Collect the configured statistics for every parameter and send them to all loggers."""
        if self._check_and_adjust_log_interval(trainer, pl_module):
            stats = {}
            # quartile positions 0.25, 0.5, 0.75 on the training device
            q = torch.arange(0.25, 1, 0.25).round(decimals=2).to(trainer.model.device)
            for param_group in optimizer.param_groups:
                # assumes param groups carry a "names" entry parallel to "params"
                # (as produced by ParameterGroup.to_optimizer_dict)
                for name, param in zip(param_group["names"], param_group["params"]):
                    if self.log_params or self.log_lrs:
                        v_detached = param.detach()
                    if self.log_params:
                        if torch.isnan(v_detached).sum() > 0:
                            log_warn(f"# NaN in param {name}")
                        if torch.isinf(v_detached).sum() > 0:
                            log_warn(f"# Inf in param {name}")
                        add_metrics_to_stats(stats, "param", name, v_detached, self.metrics)
                        if self.log_quantiles and v_detached.size().numel() < 10000000:
                            deciles = torch.quantile(v_detached.float(), q, interpolation="linear")
                            for q_idx, d_val in enumerate(deciles):
                                stats[f"param/{name}/quantile-{q[q_idx]}"] = d_val.item()
                    if (self.log_gradient or self.log_lrs) and param.requires_grad:
                        # with multiple devices the sharded gradient must be gathered via deepspeed
                        if trainer.num_devices > 1:
                            grad_data = deepspeed.utils.safe_get_full_grad(param)
                        else:
                            grad_data = param.grad
                    else:
                        grad_data = None
                    if grad_data is not None:
                        if torch.isnan(grad_data).sum() > 0:
                            log_warn(f"# NaN in grad {name}")
                        if torch.isinf(grad_data).sum() > 0:
                            log_warn(f"# Inf in grad {name}")
                        if self.log_gradient:
                            if torch.isnan(grad_data).sum() > 0 or torch.isinf(grad_data).sum() > 0:
                                # invalid gradients: record the sentinel value -10 instead of real stats
                                add_metrics_to_stats(stats, "grad", name, grad_data, self.metrics, override=-10.0)
                                if self.log_quantiles and grad_data.size().numel() < 10000000:
                                    for q_idx, _ in enumerate(q):
                                        # NOTE(review): 'param/' prefix here looks like it should be
                                        # 'grad/' (copy-paste from the param branch) — confirm
                                        stats[f"param/{name}/quantile-{q[q_idx]}"] = -10
                                stats[f"grad/{name}/mean"] = grad_data.mean().item()
                            if len(grad_data.shape) > 1 or grad_data.shape[0] > 1:
                                # NOTE(review): not an elif — for NaN/Inf gradients this overwrites
                                # the -10 sentinel entries written above; verify this is intended
                                add_metrics_to_stats(stats, "grad", name, grad_data, self.metrics)
                                if self.log_quantiles and grad_data.size().numel() < 10000000:
                                    deciles = torch.quantile(grad_data.float(), q, interpolation="linear")
                                    for q_idx, d_val in enumerate(deciles):
                                        stats[f"grad/{name}/quantile-{q[q_idx]}"] = d_val.item()
                        if self.log_lrs:
                            # effective lr: ratio of gradient norm to parameter norm
                            grad_norm = vector_norm(grad_data)
                            param_norm = vector_norm(v_detached)
                            effective_lr = (grad_norm / param_norm).item() if param_norm != 0 else 0.0
                            stats[f"param/{name}/effective_lr"] = effective_lr
                    if self.log_momentum or self.log_lrs:
                        if param in optimizer.state:
                            state = optimizer.state[param]
                        else:
                            state = {}
                        if self.log_momentum:
                            # Adam-style optimizers store "exp_avg", SGD stores "momentum_buffer"
                            if "exp_avg" in state:
                                moment1 = state["exp_avg"]
                            elif "momentum_buffer" in state:
                                moment1 = state["momentum_buffer"]
                            else:
                                moment1 = None
                            if moment1 is not None:
                                add_metrics_to_stats(stats, "1st_order_momentum", name, moment1, self.metrics)
                            if "exp_avg_sq" in state:
                                add_metrics_to_stats(stats, "2nd_order_momentum", name, state["exp_avg_sq"], self.metrics)
                        if self.log_lrs and "lr" in state:
                            stats[f"param/{name}/lr"] = state["lr"].item()
            if trainer.loggers is not None:
                for logger in trainer.loggers:
                    logger.log_metrics(stats, step=trainer.global_step)

View file

@ -0,0 +1,156 @@
from pathlib import Path
from typing import Any, Literal, Optional
from .utils import AttributeDict, EndlessList, convert_type_inside_dict, maybe_abspath, some, wrap_list
class BaseConfig(AttributeDict):
    """AttributeDict-backed config; nested plain dicts are converted recursively."""

    def __init__(self, config: dict):
        converted = convert_type_inside_dict(config, dict, AttributeDict)
        super().__init__(converted)
class NamedConfig(BaseConfig):
    """Config carrying a mandatory name and an output directory name (defaults to the name)."""

    def __init__(
        self,
        config: dict[str, Any],
        identifier_key: str = "name",
        outdir_key: str = "output_dir_name"
    ) -> None:
        super().__init__(config)
        name = config[identifier_key]
        self.name = name
        self.output_dir_name = config.get(outdir_key, name)
class OptimizerConfig(NamedConfig):
    """Optimizer section of the config, enriched with the task's step/epoch limits."""

    def __init__(
        self,
        config: dict[str, Any],
        optimizer_key: str,
        task_key: str,
        identifier_key: str = "name",
        outdir_key: str = "output_dir_name"
    ) -> None:
        cfg = dict(config[optimizer_key])
        task_cfg = config[task_key]
        self.lr_interval: Literal["step", "epoch"] = cfg.get("lr_interval", "step")
        # limits come from the task section and are mirrored into the optimizer dict
        self.max_steps: int = task_cfg.get("max_steps", None)
        self.max_epochs: int = task_cfg["max_epochs"]
        cfg["max_steps"] = self.max_steps
        cfg["max_epochs"] = self.max_epochs
        super().__init__(cfg, identifier_key, outdir_key)
class TaskConfig(NamedConfig):
    """Task section of the config, enriched with engine-level data settings."""

    def __init__(
        self,
        config: dict[str, Any],
        task_key: str,
        engine_key: str,
        identifier_key: str = "name",
        outdir_key: str = "output_dir_name"
    ) -> None:
        cfg = dict(config[task_key])
        engine_cfg = config[engine_key]
        self.batch_size: int = cfg["batch_size"]
        self.data_dir = Path(engine_cfg["data_dir"]).resolve()
        self.max_epochs: int = cfg["max_epochs"]
        self.max_steps: int = cfg.get("max_steps", None)
        self.target_metric: str = cfg["target_metric"]
        self.target_metric_mode: str = cfg["target_metric_mode"]
        self.workers = engine_cfg["workers"]
        # mirror engine-derived values back into the task dict
        cfg["data_dir"] = self.data_dir
        cfg["workers"] = self.workers
        super().__init__(cfg, identifier_key, outdir_key)
class EngineConfig(BaseConfig):
    """Engine section of the config: trainer, scheduling, SLURM and output settings.

    Reads all required keys from config[engine_key], resolves paths and defaults,
    and writes the derived values back into the dict before handing it to BaseConfig.
    """
    def __init__(self, config: dict[str, Any], task_key: str, engine_key: str) -> None:
        cfg = dict(config[engine_key])
        self.accelerator = cfg["accelerator"]
        self.deterministic: bool | Literal["warn"] = cfg["deterministic"]
        self.data_dir = Path(cfg["data_dir"]).resolve()
        self.detect_anomaly: bool = cfg["detect_anomaly"]
        # devices defaults to 1 when not set
        self.devices: int = some(cfg["devices"], default=1)
        self.early_stopping: Optional[int] = cfg["early_stopping"]
        # falls back to the task's target metric when no explicit metric is given
        self.early_stopping_metric: str = some(cfg["early_stopping_metric"], default=config[task_key]["target_metric"])
        self.gradient_clip_alg: str = cfg["gradient_clip_alg"]
        self.gradient_clip_val: Optional[float] = cfg["gradient_clip_val"]
        self.log_extra: bool | dict[str, bool] = cfg["log_extra"]
        # NOTE(review): attribute name is misspelled ('inteval'); kept as-is since
        # code elsewhere may access it under this name — confirm before renaming
        self.logging_inteval: int = cfg["logging_interval"]
        self.max_steps: int = config[task_key].get("max_steps", None)
        self.optimize_memory: bool = cfg["optimize_memory"]
        self.output_dir = Path(cfg["output_dir"]).resolve()
        self.plot: bool = cfg["plot"]
        self.precision: str = cfg["precision"]
        self.restrict_train_epochs: Optional[int] = cfg["restrict_train_epochs"]
        # 'resume' may be a bool or a checkpoint path string
        _resume = cfg.get("resume", False)
        self.resume: Optional[Path] | bool = Path(_resume).resolve() if isinstance(_resume, str) else _resume
        self.run_scheduler: str = cfg["run_scheduler"]
        self.seed: int = cfg["seed"]
        self.seed_mode: str = cfg["seed_mode"]
        self.save_sbatch_scripts: Optional[Path] = maybe_abspath(cfg["save_sbatch_scripts"])
        self.sbatch_args: dict[str, str] = cfg["sbatch_args"]
        self.sbatch_script_template: Optional[Path] = maybe_abspath(cfg["sbatch_script_template"])
        self.sbatch_time_factor: float = cfg["sbatch_time_factor"]
        self.slurm_log_dir: Optional[Path] = maybe_abspath(cfg["slurm_log_dir"])
        self.silent: bool = cfg.get("silent", False)
        self.test: bool = cfg.get("test", True)
        self.train: bool = cfg.get("train", True)
        self.validate: bool = cfg.get("validate", False)
        self.workers: int = cfg["workers"]
        # write resolved/derived values back so the dict view matches the attributes
        cfg["data_dir"] = self.data_dir
        cfg["devices"] = self.devices
        cfg["early_stopping_metric"] = self.early_stopping_metric
        cfg["max_steps"] = self.max_steps
        cfg["output_dir"] = self.output_dir
        cfg["resume"] = self.resume
        cfg["slurm_log_dir"] = self.slurm_log_dir
        cfg["save_sbatch_scripts"] = self.save_sbatch_scripts
        cfg["sbatch_script_template"] = self.sbatch_script_template
        super().__init__(cfg)

    def outpath_relevant_engine_keys(self, prefix: str = "") -> list[str]:
        """Engine keys that influence the run's output path (optionally prefixed)."""
        keys = [
            "accelerator",
            "deterministic",
            "detect_anomaly",
            "devices",
            "early_stopping",
            "gradient_clip_alg",
            "gradient_clip_val",
            "optimize_memory",
            "precision",
            "seed"
        ]
        return [f"{prefix}{k}" for k in keys]

    def outpath_irrelevant_engine_keys(self, prefix: str = "") -> list[str]:
        """Complement of `outpath_relevant_engine_keys` over all keys of this config."""
        return [f"{prefix}{k}" for k in self.keys() if k not in self.outpath_relevant_engine_keys()]
class EvalConfig(BaseConfig):
    """Evaluation/plotting section of the config.

    Resolves output paths, normalizes list-valued options and wires the plot
    axes into EndlessList instances before handing everything to BaseConfig.

    Args:
        config: the full experiment config dict.
        eval_key: key of the evaluation section inside `config`.
        engine_key: key of the engine section (used for the default output dir).
        ignore_keys: keys to exclude when grouping results; defaults to [].
    """
    def __init__(self, config: dict[str, Any], eval_key: str, engine_key: str, ignore_keys = None) -> None:
        cfg = dict(config[eval_key])
        # well-known file names produced by a finished run
        self.experiment_files = AttributeDict(dict(
            best_model = "results_best_model.json",
            last_model = "results_final_model.json",
            config = "config.yaml"
        ))
        self.output_types: list[str] = wrap_list(cfg["output_types"])
        experiment_dir = Path(config[engine_key]["output_dir"]).resolve()
        # default plot output location lives next to the experiment results
        self.output_dir: Path = some(maybe_abspath(cfg["output_dir"]), default=experiment_dir / "plots")
        self.experiment_name: str = cfg["experiment_name"]
        self.verbose: bool = cfg.get("verbose", False)
        split = cfg.get("split_groups", False)
        self.split_groups: bool | list[str] = split if isinstance(split, bool) else wrap_list(split)
        self.checkpoints: list[Literal["last", "best"]] = wrap_list(cfg["checkpoints"])
        self.column_split_key: Optional[str] = cfg.get("column_split_key", None)
        self.column_split_order: Optional[list[str]] = cfg.get("column_split_order", None)
        self.ignore_keys: list[str] = some(ignore_keys, default=[])
        self.aggregate_groups: list[str] = wrap_list(cfg["aggregate_groups"])
        # write normalized values back into the dict passed to BaseConfig
        # (fix: "output_types" was previously assigned twice)
        cfg["ignore_keys"] = self.ignore_keys
        cfg["output_types"] = self.output_types
        cfg["output_dir"] = self.output_dir
        cfg["aggregate_groups"] = self.aggregate_groups
        cfg["plot"]["x_axis"] = EndlessList(wrap_list(cfg["plot"]["x_axis"]))
        cfg["plot"]["y_axis"] = EndlessList(wrap_list(cfg["plot"]["y_axis"]))
        cfg["split_groups"] = self.split_groups
        super().__init__(cfg)

View file

@ -0,0 +1,41 @@
engine:
accelerator: gpu # Whether to train on cpu or gpu
check_finite: true # Check if 'early_stopping_metric' is finite during training. Aborts training if not. Only active when 'early_stopping' is not null.
data_dir: ./data # Where you want to store the training data
deterministic: warn # 'warn' tries to use deterministic algorithms if possible, also accepts true or false.
detect_anomaly: false # Lightning trainer argument with same name.
devices: null # This is set by each task by default, but can be overridden
early_stopping: null # The number of epochs to wait before stopping if no improvement is found. Set to null to disable.
early_stopping_metric: null # Metric to use for early stopping. If null, uses 'task.target_metric'.
gradient_clip_alg: norm # {value, norm} to disable gradient clipping: set 'gradient_clip_val' to null
gradient_clip_val: null # DEFAULT: don't clip gradients, expects value in [0, 1]
log_extra: false # Activate logging of gradients and more. Can be bool or a dict with the options supported by callback `LogTrainingStats` in `pytorch_fob/engine/callbacks.py`.
logging_interval: 50 # Number of steps between each logging step.
optimize_memory: false # Use nondeterministic, but memory-efficient algorithms for self-attention
output_dir: ./experiments # Where you want to store the results
plot: true # Whether to plot the results.
precision: bf16-mixed # Floating precision of training, see https://lightning.ai/docs/pytorch/stable/common/precision_basic.html
restrict_train_epochs: null # Only train for a specific number of epochs. Set to null to disable. The epochs set here are counted from start of training, so this works with 'resume'.
resume: true # You can either pass the path to your checkpoint here or set to true, which loads the last checkpoint.
run_scheduler: sequential # How to schedule the runs of the experiment. Supported values:
# 'sequential': runs are performed sequentially
# 'single:N' where N is the number of the run starting from 1.
# 'slurm_array': runs are scheduled using a SLURM array job.
# 'slurm_jobs': runs are scheduled using independent SLURM jobs
save_sbatch_scripts: null # Path to directory where sbatch scripts will be saved. If null, sbatch scripts will not be saved.
sbatch_time_factor: 1 # Time factor for SLURM. Multiplies all default times by this factor.
sbatch_args: # Additional arguments to pass to sbatch. Only used if run_scheduler is 'slurm_array'.
# ntasks-per-node and gres are set to 'devices' by default
# cpus-per-task is set to 'workers' by default
nodes: 1
mem-per-cpu: 2gb
time: 00:30:00 # Each task has its own default time (assumes A100 or similar gpu). Format: HH:MM:SS or seconds.
sbatch_script_template: null # Path to template for the sbatch script. Script can contain placeholder '__FOB_COMMAND__'. Otherwise it will be executed before the experiment. 'sbatch_args' will be added to the beginning of the script.
slurm_log_dir: null # Default: 'output_dir/slurm_logs' for run_scheduler 'slurm_array' and 'run_dir/slurm_logs' for run_scheduler 'slurm_jobs'
seed: 42 # The seed to use for the experiment
seed_mode: fixed # Currently only supports 'fixed'
silent: false # whether to hide progress bars. Recommended when writing outputs to a log file.
test: true # Whether to test the model.
train: true # Whether to train the model.
validate: false # Whether to validate the model after training (only useful if you are interested in the results, for example for HPO).
workers: 16 # The number of processes to use for dataloading

View file

@ -0,0 +1,228 @@
import json
from copy import deepcopy
from typing import Any, Callable, Iterable, Iterator, Literal, Optional
from pathlib import Path
from matplotlib.figure import Figure
from pandas import DataFrame, concat, json_normalize
from pytorch_fob.engine.configs import EvalConfig
from pytorch_fob.engine.grid_search import grid_search
from pytorch_fob.engine.parser import YAMLParser
from pytorch_fob.engine.run import Run
from pytorch_fob.engine.run_schedulers import sequential, slurm_array, slurm_jobs
from pytorch_fob.engine.utils import log_debug, log_info, log_warn, some, sort_dict_recursively
from pytorch_fob.evaluation import evaluation_path
from pytorch_fob.evaluation.plot import create_figure, get_output_file_path, save_files, set_plotstyle
from pytorch_fob.optimizers import lr_schedulers_path, optimizer_path, optimizer_names
from pytorch_fob.tasks import task_path, task_names
def engine_path() -> Path:
    """Absolute path of the directory containing this engine module."""
    here = Path(__file__).resolve()
    return here.parent
class Engine():
    """Central object of the framework: parses an experiment searchspace,
    expands it via gridsearch into individual runs, and schedules/executes
    those runs. Also provides data preparation and result plotting.
    """
    def __init__(self) -> None:
        self._runs = []  # one fully-filled config dict per gridsearch point
        self._defaults = []  # per-run pure default configs (see _fill_defaults)
        self._experiment = {}  # the normalized, sorted searchspace as parsed
        self._experiment_file = None
        self._block_plotting = False  # set when runs were submitted to SLURM (results not local)
        self.task_key = "task"
        self.optimizer_key = "optimizer"
        self.engine_key = "engine"
        self.eval_key = "evaluation"
        self.identifier_key = "name"
        self.default_file_name = "default.yaml"
        self.parser = YAMLParser()

    def run_experiment(self) -> Optional[list[int]]:
        """Execute all runs using the configured 'engine.run_scheduler'.

        Returns:
            SLURM job ids for the 'slurm_jobs' scheduler, otherwise None.
        """
        assert len(self._runs) > 0, "No runs in experiment, make sure to call 'parse_experiment' first."
        scheduler = self._runs[0][self.engine_key]["run_scheduler"]
        # the scheduler must be identical across all runs
        assert all(map(lambda x: x[self.engine_key]["run_scheduler"] == scheduler, self._runs)), \
            "You cannot perform gridsearch on 'run_scheduler'."
        if scheduler == "sequential":
            sequential(self.runs(), len(self._runs), self._experiment)
        elif scheduler.startswith("single"):
            # format 'single:N': execute only the N-th run (1-based)
            n = int(scheduler.rsplit(":", 1)[-1])
            log_info(f"Starting run {n}/{len(self._runs)}.")
            run = self._make_run(n)
            run.start()
        elif scheduler == "slurm_array":
            self._block_plotting = True  # results are produced asynchronously on the cluster
            slurm_array(list(self.runs()), self._experiment)
        elif scheduler == "slurm_jobs":
            self._block_plotting = True
            return slurm_jobs(list(self.runs()), self._experiment)
        else:
            raise ValueError(f"Unsupported run_scheduler: {scheduler=}.")

    def parse_experiment_from_file(self, file: Path, extra_args: Iterable[str] = tuple()):
        """Load the experiment yaml from 'file' and parse it; 'extra_args' are CLI overrides."""
        self._experiment_file = file.resolve()
        searchspace: dict[str, Any] = self.parser.parse_yaml(self._experiment_file)
        self.parse_experiment(searchspace, extra_args)

    def parse_experiment(self, searchspace: dict[str, Any], extra_args: Iterable[str] = tuple()):
        """Normalize 'searchspace', expand it via gridsearch and fill every run with defaults."""
        self.parser.parse_args_into_searchspace(searchspace, extra_args)
        # normalize experiment
        self._named_dicts_to_list(
            searchspace,
            [self.optimizer_key, self.task_key],
            [optimizer_names(), task_names()]
        )
        searchspace = sort_dict_recursively(searchspace)
        self._experiment = deepcopy(searchspace)
        # exclude plotting from gridsearch
        if self.eval_key in searchspace:
            eval_config = searchspace.pop(self.eval_key)
        else:
            eval_config = {}
        log_debug("Performing gridsearch...")
        self._runs = grid_search(searchspace)
        log_debug(f"Found {len(self._runs)} runs.")
        for run in self._runs:
            # re-attach the (non-gridsearched) evaluation config to every run
            run[self.eval_key] = eval_config
        self._fill_runs_from_default(self._runs)
        self._fill_defaults()

    def runs(self) -> Iterator[Run]:
        """
        Creates and initializes runs from parsed run config.
        """
        for n, _ in enumerate(self._runs, start=1):
            yield self._make_run(n)

    def prepare_data(self):
        """Download/prepare data once for every distinct task in the experiment."""
        prepared = set()
        for n, t in enumerate(self._runs, start=1):
            name = t["task"]["name"]
            if name not in prepared:
                run = self._make_run(n)
                log_info(f"Setting up data for {run.task_key} '{run.task.name}'...")
                run.get_datamodule().prepare_data()
                log_info("... finished.")
                prepared.add(name)

    def plot(self, save: bool = True) -> list[Figure]:
        """Create one figure per requested checkpoint mode; optionally save to disk.

        Returns an empty list when plotting is disabled or runs were scheduled on SLURM.
        """
        run = next(self.runs())
        if self._block_plotting or not run.engine.plot:
            return []
        config = run.evaluation
        set_plotstyle(config)
        figs = []
        for mode in config.checkpoints:
            df = self.dataframe_from_runs(mode)
            if config.plot.single_file:
                fig, dfs = self.plot_one_fig(df, config)
                if save:
                    self.save_one_plot(fig, dfs, config, mode)
                figs.append(fig)
            else:
                # TODO: option to split into multiple files
                raise NotImplementedError("evaluation.plot.single_file=False is not implemented yet.")
        return figs

    def plot_one_fig(self, df: DataFrame, config: EvalConfig):
        """Build a single figure, optionally splitting columns by 'column_split_key'."""
        if config.column_split_key is None:
            dfs = [df]
        else:
            groups = df.groupby(config.column_split_key)
            # default column order: sorted group names
            order = some(config.column_split_order, default=map(lambda x: x[0], sorted(groups)))
            dfs: list[DataFrame] = [groups.get_group(group_name) for group_name in order]
        fig, _ = create_figure(dfs, config)
        return fig, dfs

    def save_one_plot(self, fig, dfs: list[DataFrame], config: EvalConfig, mode: Literal["last", "best"]):
        """Write the figure (and its data) to the configured output location."""
        output_file_path = get_output_file_path(dfs, config, suffix=mode)
        save_files(fig, dfs, output_file_path, config)

    def dataframe_from_runs(self, mode: Literal["last", "best"]) -> DataFrame:
        """Collect each run's flattened config plus its result metric into one DataFrame.

        Runs with a missing result file or missing metric value are skipped with a warning.
        Raises ValueError when no run produced usable results.
        """
        dfs: list[DataFrame] = []
        for run in self.runs():
            df = json_normalize(run.get_config())
            if mode == "last":
                result_file = run.run_dir / run.evaluation.experiment_files.last_model
            elif mode == "best":
                result_file = run.run_dir / run.evaluation.experiment_files.best_model
            else:
                raise ValueError(f"mode {mode} not supported")
            if not result_file.is_file():
                log_warn(f"result file {result_file} not found, skipping this hyperparameter setting")
                continue
            metric = run.evaluation.plot.metric
            with open(result_file, "r", encoding="utf8") as f:
                content = json.load(f)
            if metric in content[0]:
                df.at[0, metric] = content[0][metric]
            else:
                log_warn(f"could not find value for {metric} in json, skipping this hyperparameter setting")
                continue
            dfs.append(df)
        if len(dfs) == 0:
            raise ValueError("no dataframes found, check your config")
        return concat(dfs, sort=False)

    def _make_run(self, n: int) -> Run:
        """Instantiate the Run object for run number n (1-based)."""
        i = n - 1
        return Run(
            self._runs[i],
            self._defaults[i],
            self.task_key,
            self.optimizer_key,
            self.engine_key,
            self.eval_key,
            self.identifier_key
        )

    def _named_dicts_to_list(self, searchspace: dict[str, Any], keys: list[str], valid_options: list[list[str]]):
        """Convert 'key: {name1: {...}, ...}' into 'key: [{name: name1, ...}, ...]' in place."""
        assert len(keys) == len(valid_options)
        for key, opts in zip(keys, valid_options):
            if key not in searchspace:
                continue
            if isinstance(searchspace[key], dict) and all(name in opts for name in searchspace[key]):
                searchspace[key] = [cfg | {self.identifier_key: name} for name, cfg in searchspace[key].items()]

    def _fill_defaults(self):
        """For each run, build the pure default config of its task and optimizer."""
        self._defaults = []
        for run in self._runs:
            default_cfg = {
                k: {self.identifier_key: run[k][self.identifier_key]}
                for k in [self.task_key, self.optimizer_key]
            }
            self._defaults.append(default_cfg)
        self._fill_runs_from_default(self._defaults)

    def _fill_runs_from_default(self, runs: list[dict[str, Any]]):
        """Merge each run dict on top of the default configs (task, optimizer, engine, eval)."""
        for i, _ in enumerate(runs):
            # order from higher to lower in hierarchy
            runs[i] = self._fill_named_from_default(runs[i], self.task_key, task_path)
            runs[i] = self._fill_named_from_default(runs[i], self.optimizer_key, optimizer_path)
            runs[i] = self._fill_unnamed_from_default(runs[i], lr_schedulers_path)
            runs[i] = self._fill_unnamed_from_default(runs[i], engine_path)
            runs[i] = self._fill_unnamed_from_default(runs[i], evaluation_path)

    def _fill_unnamed_from_default(self, experiment: dict[str, Any], unnamed_root: Callable) -> dict[str, Any]:
        """Merge 'experiment' on top of the default.yaml found under unnamed_root()."""
        default_path: Path = unnamed_root() / self.default_file_name
        default_config = self.parser.parse_yaml(default_path)
        self.parser.merge_dicts_hierarchical(default_config, experiment)
        return default_config

    def _fill_named_from_default(self, experiment: dict[str, Any], key: str, named_root: Callable) -> dict[str, Any]:
        """Merge 'experiment' on top of the default.yaml of the named entry (task/optimizer)."""
        self._argcheck_named(experiment, key, self.identifier_key)
        named = experiment[key]
        if isinstance(named, dict):
            named = named[self.identifier_key]
        else:
            # allow a bare string as shorthand for {name: <string>}
            experiment[key] = {self.identifier_key: named}
        default_path: Path = named_root(named) / self.default_file_name
        default_config = self.parser.parse_yaml(default_path)
        self.parser.merge_dicts_hierarchical(default_config, experiment)
        return default_config

    def _argcheck_named(self, experiment: dict[str, Any], key: str, identifier: str):
        """Validate that 'experiment[key]' exists and names a concrete entry."""
        assert key in experiment, f"You did not provide any {key}."
        assert isinstance(experiment[key], str) or identifier in experiment[key], \
            f"Unknown {key}, either specify only a string or provide a key '{identifier}'"

View file

@ -0,0 +1,32 @@
from typing import Any
def unique(xs: list) -> list:
    """Return a copy of xs with duplicates removed, keeping first occurrences."""
    out: list = []
    for item in xs:
        if item in out:
            continue
        out.append(item)
    return out


def grid_search(d: dict[str, Any]) -> list[dict[str, Any]]:
    """Expand a searchspace into the list of all concrete configurations.

    Dicts are expanded key-by-key (cartesian product of their values),
    lists denote alternative values, and scalars are returned as-is.
    """
    if isinstance(d, dict):
        if not d:
            return [dict()]
        remainder = d.copy()
        key, value = remainder.popitem()
        options = unique(grid_search(value))
        return [base | {key: opt} for base in grid_search(remainder) for opt in options]
    if isinstance(d, list):
        return [cfg for item in d for cfg in grid_search(item)]
    return [d]

View file

@ -0,0 +1,226 @@
from dataclasses import dataclass, field
from typing import Any, Callable, Iterable, Optional
from torch import nn
from torch.nn import Module
from torch.nn.parameter import Parameter
from pytorch_fob.engine.utils import some, log_warn
@dataclass
class ParameterGroup():
named_parameters: dict[str, Parameter]
lr_multiplier: Optional[float] = field(default=None)
weight_decay_multiplier: Optional[float] = field(default=None)
optimizer_kwargs: dict[str, Any] = field(default_factory=dict)
def __and__(self, other) -> "ParameterGroup":
assert isinstance(other, ParameterGroup)
n1 = set(self.named_parameters.keys())
n2 = set(other.named_parameters.keys())
all_params = self.named_parameters | other.named_parameters
n12 = n1 & n2
new_params = {n: all_params[n] for n in n12}
return ParameterGroup(
named_parameters=new_params,
lr_multiplier=some(other.lr_multiplier, default=self.lr_multiplier),
weight_decay_multiplier=some(other.weight_decay_multiplier, default=self.weight_decay_multiplier),
optimizer_kwargs=self.optimizer_kwargs | other.optimizer_kwargs
)
def __len__(self) -> int:
return len(self.named_parameters)
def __bool__(self) -> bool:
return not self.empty()
def empty(self) -> bool:
return len(self.named_parameters) == 0
def to_optimizer_dict(
self,
lr: Optional[float] = None,
weight_decay: Optional[float] = None
) -> dict[str, list[Parameter] | Any]:
names = sorted(self.named_parameters)
d = {
"params": [self.named_parameters[n] for n in names],
"names": names,
**self.optimizer_kwargs
}
if lr is not None:
d["lr"] = self.lr_multiplier * lr if self.lr_multiplier is not None else lr
if weight_decay is not None:
d["weight_decay"] = self.weight_decay_multiplier * weight_decay \
if self.weight_decay_multiplier is not None else weight_decay
return d
class GroupedModel(Module):
    """
    Wrapper around a nn.Module to allow specifying different optimizer settings for different parameters.
    To use this feature for your task, inherit from this class and override the `parameter_groups` method.
    Then simply wrap your model before passing it to the `__init__` method of the `TaskModel` superclass.
    """
    def __init__(self, model: Module) -> None:
        super().__init__()
        self.model = model

    def forward(self, *args, **kwargs):
        # delegate straight to the wrapped module
        return self.model.forward(*args, **kwargs)

    def parameter_groups(self) -> list[ParameterGroup]:
        # default grouping — presumably splits by weight-decay applicability,
        # see wd_group_named_parameters for the actual rules
        return wd_group_named_parameters(self.model)

    def grouped_parameters(
        self,
        lr: Optional[float] = None,
        weight_decay: Optional[float] = None
    ) -> list[dict[str, list[Parameter] | Any]]:
        """Parameter groups in the dict format expected by torch optimizers."""
        groups = self.parameter_groups()
        return [group.to_optimizer_dict(lr, weight_decay) for group in groups]
def merge_parameter_splits(split1: list[ParameterGroup], split2: list[ParameterGroup]) -> list[ParameterGroup]:
    """
    Merge two lists of ParameterGroup objects into a single list.
    Assumes that both input lists partition the parameters.
    """
    merged = []
    for first in split1:
        for second in split2:
            intersection = first & second
            if intersection:
                merged.append(intersection)
    return merged
def group_named_parameters(
    model: Module,
    g1_conds: Iterable[Callable] = (lambda *_: True,),
    g2_conds: Iterable[Callable] = (lambda *_: True,),
    special_conds: Iterable[Callable] = tuple(),
    ignore_conds: Iterable[Callable] = tuple(),
    g1_kwargs: Optional[dict[str, Any]] = None,
    g2_kwargs: Optional[dict[str, Any]] = None,
    debug: bool = False
) -> list[ParameterGroup]:
    """
    Group named parameters based on specified conditions and return a list of ParameterGroup objects.
    Args:
        model (Module): The neural network model.
        g1_conds (Iterable[Callable]): Conditions for selecting parameters for group 1.
        g2_conds (Iterable[Callable]): Conditions for selecting parameters for group 2.
        special_conds (Iterable[Callable]): Conditions for selecting special parameters that should not be grouped.
        ignore_conds (Iterable[Callable]): Conditions for ignoring parameters (e.g. if they occur in submodules).
        g1_kwargs (Optional[dict[str, Any]]): Additional keyword arguments for constructor of group 1.
        g2_kwargs (Optional[dict[str, Any]]): Additional keyword arguments for constructor of group 2.
        debug (bool): If True, warn about parameters that matched no rule.
    Returns:
        List[ParameterGroup]: A list of ParameterGroup objects containing named parameters.
    """
    g1_kwargs = g1_kwargs if g1_kwargs is not None else {}
    g2_kwargs = g2_kwargs if g2_kwargs is not None else {}
    s1 = set()
    s2 = set()
    special = set()
    # only trainable parameters are considered
    param_dict = {pn: p for pn, p in model.named_parameters() if p.requires_grad}
    for mn, m in model.named_modules():
        for pn, p in m.named_parameters():
            fpn = f"{mn}.{pn}" if mn else pn  # full param name
            if not p.requires_grad or fpn not in param_dict:
                continue  # frozen weights
            elif any(c(m, p, fpn) for c in ignore_conds):
                continue
            elif any(c(m, p, fpn) for c in special_conds):
                special.add(fpn)
            elif any(c(m, p, fpn) for c in g1_conds):
                s1.add(fpn)
            elif any(c(m, p, fpn) for c in g2_conds):
                s2.add(fpn)
            elif debug:
                # BUG FIX: log_warn takes a single message string; the old call
                # passed extra positional args, which are forwarded to logging
                # as %-style format arguments and break message formatting.
                log_warn(f"group_named_parameters: Not using any rule for {fpn} in {type(m)}")
    # anything not matched (and not special) defaults to group 1
    s1 |= (param_dict.keys() - s2 - special)
    # validate that we considered every parameter
    inter_params = s1 & s2
    union_params = s1 | s2
    assert len(inter_params) == 0, f"Parameters {str(inter_params)} made it into both s1/s2 sets!"
    # BUG FIX: the failure message previously reported a different set than the
    # one actually asserted on ('special' was not excluded in the message).
    unassigned = param_dict.keys() - special - union_params
    assert len(unassigned) == 0, f"parameters {str(unassigned)} were not separated into either s1/s2 set!"
    if not s2:
        # no second group needed: emit a single group with everything
        param_groups = [ParameterGroup(
            named_parameters=dict(zip(sorted(union_params), (param_dict[pn] for pn in sorted(union_params))))
        )]
    else:
        param_groups = [
            ParameterGroup(
                named_parameters=dict(zip(sorted(s1), (param_dict[pn] for pn in sorted(s1)))),
                **g1_kwargs
            ),
            ParameterGroup(
                named_parameters=dict(zip(sorted(s2), (param_dict[pn] for pn in sorted(s2)))),
                **g2_kwargs
            ),
        ]
    return param_groups
def wd_group_named_parameters(model: Module) -> list[ParameterGroup]:
    """Split a model's parameters into a weight-decay and a no-weight-decay group.

    Weights of linear/conv layers receive decay; biases and normalization /
    embedding parameters do not. Parameters carrying a custom `_optim`
    attribute are kept out of both groups; container modules are skipped.
    """
    decay_modules = (nn.Linear, nn.modules.conv._ConvNd)  # pylint: disable=protected-access # noqa
    no_decay_modules = (nn.modules.batchnorm._NormBase,  # pylint: disable=protected-access # noqa
                        nn.GroupNorm, nn.LayerNorm,
                        nn.LocalResponseNorm,
                        nn.Embedding)
    container_modules = (nn.Sequential,)
    return group_named_parameters(
        model,
        g1_conds=[lambda m, _, pn: pn.endswith('weight') and isinstance(m, decay_modules)],
        g2_conds=[lambda m, _, pn: pn.endswith('bias') or isinstance(m, no_decay_modules)],
        special_conds=[lambda m, p, pn: hasattr(p, '_optim')],
        ignore_conds=[lambda m, p, pn: isinstance(m, container_modules)],
        g2_kwargs={'weight_decay_multiplier': 0.0}
    )
def resolve_parameter_dicts(dict1: dict[str, Any], dict2: dict[str, Any]) -> list[dict[str, Any]]:
    """Split two optimizer param dicts into [only-in-1, only-in-2, in-both].

    Each input dict maps "params" to a list of parameters and "names" to their
    (parallel) names; any other keys are per-group kwargs and are carried over.
    For the shared part, dict2's kwargs and parameter objects take precedence.
    """
    names1, names2 = set(dict1["names"]), set(dict2["names"])
    by_name1 = dict(zip(dict1["names"], dict1["params"]))
    by_name2 = dict(zip(dict2["names"], dict2["params"]))
    assert len(names1) == len(dict1["params"])
    assert len(names2) == len(dict2["params"])
    extra1 = {k: v for k, v in dict1.items() if k not in ["params", "names"]}
    extra2 = {k: v for k, v in dict2.items() if k not in ["params", "names"]}
    shared = names1 & names2
    only1 = names1 - names2
    only2 = names2 - names1
    assert shared | only1 | only2 == names1 | names2

    def _build(names, lookup, extras) -> dict[str, Any]:
        # deterministic (sorted) parameter order in every output dict
        ordered = sorted(names)
        return {"params": [lookup[n] for n in ordered], "names": ordered, **extras}

    return [
        _build(only1, by_name1, extra1),
        _build(only2, by_name2, extra2),
        # dict2 takes precedence if an arg is present in both dicts
        _build(shared, {**by_name1, **by_name2}, {**extra1, **extra2}),
    ]
def intersect_parameter_dicts(dict1: dict[str, Any], dict2: dict[str, Any]) -> Optional[dict[str, Any]]:
    """Return the param dict shared by both inputs, or None when disjoint."""
    shared = resolve_parameter_dicts(dict1, dict2)[2]
    if len(shared["params"]) > 0:
        return shared
    return None
def merge_parameter_dicts(dict1: dict[str, Any], dict2: dict[str, Any]) -> list[dict[str, Any]]:
    """Merge two param dicts into disjoint splits, dropping empty ones."""
    return [d for d in resolve_parameter_dicts(dict1, dict2) if len(d["params"]) > 0]

View file

@ -0,0 +1,65 @@
from pathlib import Path
from typing import Any, Iterable, Optional
import re
import yaml
class YAMLParser():
    """Loads YAML config files and applies 'key.subkey=value' style overrides."""

    def __init__(self) -> None:
        pass

    def parse_yaml(self, file: Path) -> Any:
        """
        Opens and parses a YAML file.
        """
        with open(file, "r", encoding="utf8") as f:
            return yaml.safe_load(f)

    def parse_yamls_and_extra_args(self,
                                   default_yaml: Path,
                                   custom_yaml: Optional[Path],
                                   additional_args: Iterable[str] = tuple()
                                   ) -> dict:
        """Load defaults, overlay the optional custom yaml, then apply CLI overrides.

        assumes that there is a dict in the yaml
        """
        config_to_use = self.parse_yaml(default_yaml)
        if custom_yaml is not None:
            user_yaml = self.parse_yaml(custom_yaml)
            # merge in place
            self.merge_dicts_hierarchical(lo=config_to_use, hi=user_yaml)
        self.parse_args_into_searchspace(config_to_use, additional_args)
        return config_to_use

    def parse_args_into_searchspace(self, searchspace: dict[str, Any], args: Iterable[str]):
        """
        Overwrites args given in the form of 'this.that=something'. Also supports lists: 'this.that[0]=something'
        """
        for arg in args:
            self._parse_arg_into_searchspace(searchspace, arg)

    def _parse_arg_into_searchspace(self, searchspace: dict[str, Any], arg: str):
        """Apply a single 'a.b[i].c=value' override to `searchspace` in place."""
        # BUG FIX: split only on the first '=' so values that themselves
        # contain '=' (e.g. regexes or command strings) stay intact.
        keys, value = arg.split("=", 1)
        keys = keys.split(".")
        keys_with_list_indices = []
        for key in keys:
            # 'name[3]' -> 'name', 3 (negative indices allowed)
            match = re.search(r"^(.*?)\[(\-?\d+)\]$", key)
            if match:
                keys_with_list_indices.append(match.group(1))
                keys_with_list_indices.append(int(match.group(2)))
            else:
                keys_with_list_indices.append(key)
        target = searchspace
        for key in keys_with_list_indices[:-1]:
            # create missing intermediate dicts on the way down
            if isinstance(target, dict) and key not in target:
                target[key] = {}
            target = target[key]
        # parse the value as YAML so numbers/bools/lists keep their types
        target[keys_with_list_indices[-1]] = yaml.safe_load(value)

    def merge_dicts_hierarchical(self, lo: dict, hi: dict):
        """
        Overwrites values in `lo` with values from `hi` if they are present in both.
        Nested dicts are merged recursively; all other values are replaced.
        """
        for k, v in hi.items():
            if isinstance(v, dict) and isinstance(lo.get(k, None), dict):
                self.merge_dicts_hierarchical(lo[k], v)
            else:
                lo[k] = v

View file

@ -0,0 +1,298 @@
import hashlib
from pathlib import Path
import time
from typing import Any, Optional
from lightning import Callback, LightningDataModule, LightningModule, Trainer, seed_everything
from lightning.pytorch.callbacks import EarlyStopping, LearningRateMonitor, ModelCheckpoint
from lightning.pytorch.loggers import Logger, TensorBoardLogger, CSVLogger
from lightning.pytorch.utilities.types import _EVALUATE_OUTPUT
import torch
import yaml
from pytorch_fob.engine.callbacks import LogTrainingStats, OptimizerTime, PrintEpochWithTime, RestrictTrainEpochs
from pytorch_fob.engine.configs import EngineConfig, EvalConfig, OptimizerConfig, TaskConfig
from pytorch_fob.engine.utils import AttributeDict, EndlessList, calculate_steps, concatenate_dict_keys, convert_type_inside_dict, dict_differences, findfirst, path_to_str_inside_dict, precision_with_fallback, seconds_to_str, trainer_strategy, write_results, log_warn, log_info
from pytorch_fob.optimizers.optimizers import Optimizer
from pytorch_fob.tasks.tasks import TaskDataModule, TaskModel, import_task
class Run():
    """A single concrete run: one resolved (task, optimizer, engine, eval)
    configuration with its own output directory, checkpoints and scores."""

    def __init__(
            self,
            config: dict[str, Any],
            default_config: dict[str, Any],
            task_key: str,
            optimizer_key: str,
            engine_key: str,
            eval_key: str,
            identifier_key: str
    ) -> None:
        """
        setup: download and prepare data before creating the Run
        """
        self._config = config
        self._default_config = default_config  # used to compute the outpath diff
        self.task_key = task_key
        self.optimizer_key = optimizer_key
        self.engine_key = engine_key
        self.eval_key = eval_key
        self.identifier_key = identifier_key
        self._generate_configs()
        self._set_outpath()
        self._callbacks = AttributeDict({})  # filled lazily in get_callbacks()

    def start(self) -> dict[str, _EVALUATE_OUTPUT]:
        """Execute the run (train and/or test as configured) and return all scores."""
        self.run_dir.mkdir(parents=True, exist_ok=True)
        self.export_config()
        scores: dict[str, _EVALUATE_OUTPUT] = {}
        if any([self.engine.train, self.engine.test]):
            self._ensure_resume_path()
            self.ensure_max_steps()
            torch.set_float32_matmul_precision('high')
            seed_everything(self.engine.seed, workers=True)
            model, data_module = self.get_task()
            if self.engine.train:
                trainer = self.get_trainer()
                self._train(trainer, model, data_module)
                scores["mean_optimizer_time_ms"] = self._callbacks["optimizer_time"].total_mean_optimizer_step_time_ms
                if self.engine.validate:
                    scores["validation"] = self._validate(trainer, model, data_module)
            if self.engine.test:
                tester = self.get_tester()
                if self.engine.train:  # no need to load last checkpoint, model is already loaded
                    ckpt = None
                elif self.engine.resume is not None:
                    ckpt=self.engine.resume
                else:
                    log_warn(
                        "No last checkpoint found, evaluating untrained model. " + \
                        "If this is unexpected, try to set 'engine.resume=true'."
                    )
                    ckpt = None
                scores["test_final"] = self._test(tester, model, data_module, ckpt=ckpt)  # type: ignore (see ensure_resume_path)
                best_path = self.get_best_checkpoint()
                if best_path is not None:
                    scores["test_best"] = self._test(tester, model, data_module, Path(best_path))
                else:
                    log_info("No best checkpoint found, skipping test.")
        write_results(scores, self.run_dir / "scores.json")
        return scores

    def _train(self, trainer: Trainer, model: LightningModule, data_module: LightningDataModule):
        """Fit the model, timing the whole phase and writing train_time.txt."""
        start_time = time.time()
        if self.engine.accelerator == "gpu" and torch.cuda.is_available():
            # restrict which scaled-dot-product-attention kernels are allowed;
            # mem-efficient attention is only enabled when explicitly requested
            # or when determinism is off (presumably because it is
            # non-deterministic — TODO confirm)
            with torch.backends.cuda.sdp_kernel(
                enable_flash=True,
                enable_math=True,
                enable_mem_efficient=(self.engine.optimize_memory or not self.engine.deterministic)
            ):
                trainer.fit(model, datamodule=data_module, ckpt_path=self.engine.resume)  # type: ignore
        else:
            trainer.fit(model, datamodule=data_module, ckpt_path=self.engine.resume)  # type: ignore
        end_time = time.time()
        train_time = int(end_time - start_time)
        log_info(f"Finished training in {seconds_to_str(train_time)}.")
        # Write train_time.txt
        train_time_path = self.run_dir / "train_time.txt"
        with open(train_time_path, "w") as f:
            f.write(str(train_time) + "\n")

    def _validate(self, trainer: Trainer, model: LightningModule, data_module: LightningDataModule) -> _EVALUATE_OUTPUT:
        """Run the validation loop and return its metrics."""
        score = trainer.validate(model, datamodule=data_module)
        return score

    def _test(self, tester: Trainer, model: LightningModule, data_module: LightningDataModule, ckpt: Optional[Path] = None) -> _EVALUATE_OUTPUT:
        """Test from `ckpt` (or engine.resume) and write results_<mode>_model.json."""
        ckpt_path = self.engine.resume if ckpt is None else ckpt
        # 'last-*' checkpoints (or no checkpoint at all) count as the final model
        mode = "final" if ckpt_path is None or ckpt_path.stem.startswith("last") else "best"  # type: ignore
        log_info(f"Testing {mode} checkpoint...")
        score = tester.test(model, datamodule=data_module, ckpt_path=ckpt_path)  # type: ignore
        write_results(score, self.run_dir / f"results_{mode}_model.json")
        return score

    def export_config(self):
        """Write this run's resolved config as YAML into the run directory."""
        with open(self.run_dir / "config.yaml", "w", encoding="utf8") as f:
            d = path_to_str_inside_dict(self._config)
            d = convert_type_inside_dict(d, EndlessList, list)
            yaml.safe_dump(d, f)

    def export_config_dict(self) -> dict[str, Any]:
        """Return the config as a plain, YAML-serializable dict."""
        d = path_to_str_inside_dict(self._config)
        d = convert_type_inside_dict(d, EndlessList, list)
        return d

    def get_config(self) -> AttributeDict:
        """The raw config with attribute-style access."""
        return AttributeDict(self._config)

    def get_optimizer(self) -> Optimizer:
        """Instantiate the configured optimizer wrapper."""
        return Optimizer(self.optimizer)

    def get_task(self) -> tuple[TaskModel, TaskDataModule]:
        """Import the task package and build its model and datamodule."""
        task_module = import_task(self.task.name)
        return task_module.get_task(self.get_optimizer(), self.task)

    def get_datamodule(self) -> TaskDataModule:
        """Build only the task's datamodule (no model)."""
        task_module = import_task(self.task.name)
        return task_module.get_datamodule(self.task)

    def get_callbacks(self) -> list[Callback]:
        """All trainer callbacks; created once and cached in self._callbacks."""
        if len(self._callbacks) < 1:
            self._init_callbacks()
        return list(self._callbacks.values())

    def get_loggers(self) -> list[Logger]:
        """TensorBoard and CSV loggers, both writing below the run directory."""
        return [
            TensorBoardLogger(
                save_dir=self.run_dir,
                name="tb_logs"
            ),
            CSVLogger(
                save_dir=self.run_dir,
                name="csv_logs"
            )
        ]

    def get_trainer(self) -> Trainer:
        """Trainer configured for the training phase."""
        return Trainer(
            max_steps=self.engine.max_steps,
            logger=self.get_loggers(),
            callbacks=self.get_callbacks(),
            devices=self.engine.devices,
            strategy=trainer_strategy(self.engine.devices),
            enable_progress_bar=(not self.engine.silent),
            deterministic=self.engine.deterministic,
            detect_anomaly=self.engine.detect_anomaly,
            gradient_clip_val=self.engine.gradient_clip_val,
            gradient_clip_algorithm=self.engine.gradient_clip_alg,
            precision=precision_with_fallback(self.engine.precision),  # type: ignore
            accelerator=self.engine.accelerator,
            log_every_n_steps=self.engine.logging_inteval
        )

    def get_tester(self) -> Trainer:
        """Single-device Trainer used only for testing (no loggers attached)."""
        return Trainer(
            devices=1,
            logger=False,
            enable_progress_bar=(not self.engine.silent),
            deterministic=self.engine.deterministic,
            precision=precision_with_fallback(self.engine.precision),  # type: ignore
            accelerator=self.engine.accelerator
        )

    def get_best_checkpoint(self) -> Optional[Path]:
        """Path to the best checkpoint: callback result first, then the
        checkpoint dir (file whose stem starts with 'best'); None if neither."""
        model_checkpoint = self._callbacks.get("best_model_checkpoint", None)
        if model_checkpoint is not None:
            model_checkpoint = Path(model_checkpoint.best_model_path)
            # an empty best_model_path resolves to a directory -> treat as missing
            model_checkpoint = model_checkpoint if not model_checkpoint.is_dir() else None
        if model_checkpoint is None:
            available_checkpoints = self.get_available_checkpoints()
            model_checkpoint = findfirst(lambda x: x.stem.startswith("best"), available_checkpoints)
        return model_checkpoint

    def get_available_checkpoints(self) -> list[Path]:
        """All .ckpt files in the checkpoint directory (empty list if none)."""
        if self.checkpoint_dir.exists():
            return list(filter(lambda x: x.suffix == ".ckpt", self.checkpoint_dir.iterdir()))
        return []

    def ensure_max_steps(self):
        """
        Ensures that `self.task.max_steps` is calculated and set correctly.
        """
        if self.task.max_steps is None:
            max_steps = self._calc_max_steps()
            self._config[self.task_key]["max_steps"] = max_steps
            # keep the default config in sync so max_steps does not show up
            # as a difference in the output path
            if self._default_config[self.task_key]["max_steps"] is None:
                self._default_config[self.task_key]["max_steps"] = max_steps
            self._generate_configs()
            log_info(f"'max_steps' not set explicitly, using {max_steps=} (calculated from " +
                     f"max_epochs={self.task.max_epochs}, batch_size={self.task.batch_size}, devices={self.engine.devices})")

    def _ensure_resume_path(self):
        """
        Ensures that `self.engine.resume` is either a valid Path or None.
        """
        if isinstance(self.engine.resume, Path):
            pass
        elif isinstance(self.engine.resume, bool):
            resume_path = None
            if self.engine.resume:
                available_checkpoints = self.get_available_checkpoints()
                if len(available_checkpoints) < 1:
                    log_warn("engine.resume=True but no checkpoint was found. Starting run from scratch.")
                else:
                    resume_path = findfirst(lambda x: x.stem == "last", available_checkpoints)
            self._config[self.engine_key]["resume"] = resume_path
            self._generate_configs()
        else:
            raise TypeError(f"Unsupportet type for 'resume', got {type(self.engine.resume)=}.")

    def _calc_max_steps(self) -> int:
        """Derive total optimizer steps from epochs, dataset size, devices and batch size."""
        dm = self.get_datamodule()
        dm.setup("fit")
        train_samples = len(dm.data_train)
        return calculate_steps(self.task.max_epochs, train_samples, self.engine.devices, self.task.batch_size)

    def _init_callbacks(self):
        """Populate self._callbacks with every configured callback instance."""
        self._callbacks["optimizer_time"] = OptimizerTime()
        self._callbacks["best_model_checkpoint"] = ModelCheckpoint(
            dirpath=self.checkpoint_dir,
            filename="best-{epoch}-{step}",
            monitor=self.task.target_metric,
            mode=self.task.target_metric_mode
        )
        self._callbacks["model_checkpoint"] = ModelCheckpoint(
            dirpath=self.checkpoint_dir,
            enable_version_counter=False,
            every_n_epochs=1,
            save_last=True
        )
        if self.engine.early_stopping is not None:
            self._callbacks["early_stopping"] = EarlyStopping(
                monitor=self.engine.early_stopping_metric,
                mode=self.task.target_metric_mode,
                patience=self.engine.early_stopping,
                check_finite=self.engine.check_finite,
                log_rank_zero_only=True
            )
        self._callbacks["lr_monitor"] = LearningRateMonitor(
            logging_interval=self.optimizer.lr_interval
        )
        if self.engine.log_extra:
            # log_extra may be a bool or a dict of LogTrainingStats kwargs
            self._callbacks["extra"] = LogTrainingStats(
                log_every_n_steps=self.engine.logging_inteval,
                **(self.engine.log_extra if isinstance(self.engine.log_extra, dict) else {})
            )
        self._callbacks["print_epoch"] = PrintEpochWithTime(self.engine.silent)
        if self.engine.restrict_train_epochs is not None:
            self._callbacks["restrict_train_epochs"] = RestrictTrainEpochs(self.engine.restrict_train_epochs)
        # TODO: callback for logging time per step

    def outpath_exclude_keys(self) -> list[str]:
        """Config keys that never influence the output directory name."""
        return [
            self.eval_key,
            "output_dir_name"
        ]

    def _set_outpath(self):
        """Derive run_dir from the diff between this config and the defaults."""
        base: Path = self.engine.output_dir / self.task.output_dir_name / self.optimizer.output_dir_name
        exclude_keys = self.outpath_exclude_keys()
        exclude_keys += self.engine.outpath_irrelevant_engine_keys()
        diffs = concatenate_dict_keys(dict_differences(self._config, self._default_config), exclude_keys=exclude_keys)
        run_dir = ",".join(f"{k}={str(v)}" for k, v in sorted(diffs.items())) if diffs else "default"
        if len(run_dir) > 254:  # max file name length
            hashdir = hashlib.md5(run_dir.encode()).hexdigest()
            log_info(f"folder name {run_dir} is too long, using {hashdir} instead.")
            run_dir = hashdir
        self.run_dir = base / run_dir
        self.checkpoint_dir = self.run_dir / "checkpoints"

    def _generate_configs(self):
        """(Re)build the typed config views from the raw config dict."""
        self.engine = EngineConfig(self._config, self.task_key, self.engine_key)
        self.optimizer = OptimizerConfig(self._config, self.optimizer_key, self.task_key, self.identifier_key)
        self.task = TaskConfig(self._config, self.task_key, self.engine_key, self.identifier_key)
        self.evaluation = EvalConfig(
            self._config,
            eval_key=self.eval_key,
            engine_key=self.engine_key,
            ignore_keys=self.engine.outpath_irrelevant_engine_keys(prefix=f"{self.engine_key}.") + [f"{self.optimizer_key}.output_dir_name", f"{self.task_key}.output_dir_name"]
        )

View file

@ -0,0 +1,171 @@
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Any, Iterable, Optional, Sequence
import traceback
import yaml
from pytorch_fob.engine.run import Run
from pytorch_fob.engine.slurm import Slurm
from pytorch_fob.engine.utils import log_info, log_warn, seconds_to_str, some, str_to_seconds
FOB_RUN_SCRIPT = "pytorch_fob.run_experiment"
FOB_EVAL_SCRIPT = "pytorch_fob.evaluate_experiment"
def argcheck_allequal_engine(
    runs: list[Run],
    keys: list[str],
    reason: str = "'engine.run_scheduler=slurm_array'"
) -> None:
    """Raise ValueError unless every run agrees with the first on all engine keys."""
    reference = runs[0]
    all_equal = all(
        all(run.engine[key] == reference.engine[key] for run in runs[1:])
        for key in keys
    )
    if not all_equal:
        req = ", ".join(map(lambda s: "engine." + s, keys))
        raise ValueError(f"All runs must have the same values for {req} when using {reason}")
def export_experiment(run: Run, experiment: dict[str, Any]) -> Path:
    """Dump the experiment dict as YAML into the run directory; return the file path."""
    run.run_dir.mkdir(parents=True, exist_ok=True)
    outfile = run.run_dir / "experiment.yaml"
    with open(outfile, "w", encoding="utf8") as stream:
        yaml.safe_dump(experiment, stream)
    return outfile
def process_args(args: dict[str, str], run: Run) -> None:
    """Fill in missing sbatch arguments in place.

    Scales a given wall-time by engine.sbatch_time_factor and defaults the
    GPU, tasks-per-node and cpus-per-task settings from the engine config
    when the user did not set them.
    """
    if "time" in args:
        raw_time = args["time"]
        seconds = str_to_seconds(raw_time) if isinstance(raw_time, str) else raw_time
        args["time"] = seconds_to_str(int(run.engine.sbatch_time_factor * seconds))
    if "gres" not in args and "gpus" not in args:
        args["gres"] = f"gpu:{run.engine.devices}"
    if not any(k.startswith("ntasks") for k in args):
        args["ntasks-per-node"] = str(run.engine.devices)
    if not any(k.startswith("cpus") for k in args):
        args["cpus-per-task"] = str(run.engine.workers)
def wrap_template(template_path: Optional[Path], command: str, placeholder: str = "__FOB_COMMAND__") -> str:
    """Embed `command` into the script template.

    If the template contains `placeholder`, it is substituted; otherwise the
    command is appended after the template. Without a template the command is
    returned unchanged.
    """
    if template_path is None:
        return command
    with open(template_path, "r", encoding="utf8") as f:
        template = f.read()
    if placeholder in template:
        return template.replace(placeholder, command)
    return f"{template}\n{command}\n"
def get_command(experiment_file: Path, index: Optional[str], plot: bool) -> str:
    """Build the srun command line for a single run (or for the plotting step)."""
    if plot:
        run_script = FOB_EVAL_SCRIPT
        disable_plot = ""
    else:
        run_script = FOB_RUN_SCRIPT
        disable_plot = "engine.plot=false"
    scheduler = "" if index is None else f"engine.run_scheduler=single:{index}"
    return f"srun python -m {run_script} {experiment_file} {scheduler} {disable_plot}"
def get_job_name(run: Run) -> str:
    """Slurm job name for a run: 'FOB-<task>-<optimizer>'."""
    return f"FOB-{run.task.name}-{run.optimizer.name}"
def get_slurm(job_name: str, args: dict[str, str], log_dir: Path, scripts_dir: Path) -> Slurm:
    """Construct a Slurm submitter with absolute log and script directories."""
    resolved_log_dir = str(log_dir.resolve())
    resolved_scripts_dir = str(scripts_dir.resolve())
    return Slurm(
        job_name,
        args,
        log_dir=resolved_log_dir,
        scripts_dir=resolved_scripts_dir,
        bash_strict=False  # TODO: maybe add arg or just remove 'nounset'
    )
def run_slurm(
    job_name: str,
    command: str,
    args: dict[str, str],
    log_dir: Path,
    save_sbatch_scripts: Optional[Path] = None,
    dependencies: Sequence[int] = tuple(),
    dependency_type: str = "afterok"
) -> Optional[int]:
    """Submit `command` via sbatch and return the job id (None on failure).

    The generated sbatch script is kept in `save_sbatch_scripts` when given,
    otherwise written to a temporary directory that is removed afterwards.
    """
    def _submit(scripts_dir: Path) -> Optional[int]:
        # build the submitter and fire off the job
        s = get_slurm(job_name, args, log_dir, scripts_dir=scripts_dir)
        return s.run(command, name_addition="", depends_on=dependencies, dependency_type=dependency_type)

    if save_sbatch_scripts is None:
        with TemporaryDirectory() as tmpdir:
            return _submit(Path(tmpdir).resolve())
    return _submit(save_sbatch_scripts)
def run_plotting_job(
    experiment_file: Path,
    args: dict[str, str],
    log_dir: Path,
    dependencies: Sequence[int],
    template: Optional[Path] = None
) -> None:
    """Submit a small CPU-only job that plots results once `dependencies` finish."""
    args["time"] = seconds_to_str(300)  # 5 minutes should be plenty of time to plot
    args.pop("array", None)
    # no gpus needed for plotting
    args.pop("gpus", None)
    args.pop("gres", None)
    # just one cpu per node for plotting
    stale_keys = [k for k in args.keys() if k.startswith("ntasks") or k.startswith("cpus")]
    for key in stale_keys:
        args.pop(key)
    args["nodes"] = "1"
    args["ntasks-per-node"] = "1"
    args["cpus-per-task"] = "2"
    command = wrap_template(template, get_command(experiment_file, None, plot=True))
    run_slurm("FOB-plot", command, args, log_dir, dependencies=dependencies, dependency_type="afterany")
def slurm_array(runs: list[Run], experiment: dict[str, Any]) -> None:
    """Submit all runs as a single Slurm job array (plus an optional plotting job)."""
    equal_req = ["devices", "workers", "sbatch_args", "slurm_log_dir", "sbatch_script_template", "run_scheduler"]
    argcheck_allequal_engine(runs, equal_req)
    run = runs[0]  # all runs have the same args
    args = run.engine.sbatch_args
    log_dir = some(run.engine.slurm_log_dir, default=run.engine.output_dir / "slurm_logs")
    if "array" not in args:
        args["array"] = f"1-{len(runs)}"
    process_args(args, run)
    # export the (identical) experiment file into every run dir, submit any one of them
    exported = [export_experiment(r, experiment).resolve() for r in runs]
    experiment_file = exported[0]
    command = wrap_template(run.engine.sbatch_script_template,
                            get_command(experiment_file, "$SLURM_ARRAY_TASK_ID", plot=False))
    job_id = run_slurm(get_job_name(run), command, args, log_dir, save_sbatch_scripts=run.engine.save_sbatch_scripts)
    if job_id is not None and run.engine.plot:
        run_plotting_job(experiment_file, args, log_dir, [job_id], template=run.engine.sbatch_script_template)
def slurm_jobs(runs: list[Run], experiment: dict[str, Any]) -> list[int]:
    """Submit each run as its own Slurm job; return the submitted job ids."""
    job_ids = []
    experiment_file = Path()
    for i, run in enumerate(runs, start=1):
        args = run.engine.sbatch_args
        process_args(args, run)
        log_dir = some(run.engine.slurm_log_dir, default=run.run_dir / "slurm_logs")
        experiment_file = export_experiment(run, experiment).resolve()
        command = wrap_template(run.engine.sbatch_script_template,
                                get_command(experiment_file, str(i), plot=False))
        job_id = run_slurm(get_job_name(run), command, args, log_dir,
                           save_sbatch_scripts=run.engine.save_sbatch_scripts)
        if job_id is not None:
            job_ids.append(job_id)
    if job_ids and any(run.engine.plot for run in runs):
        # plotting reuses args/log_dir left over from the last submitted run
        equal_req = ["slurm_log_dir", "sbatch_script_template"]
        argcheck_allequal_engine(runs, equal_req, reason="'engine.plot=true' with 'engine.run_scheduler=slurm_jobs'")
        run_plotting_job(experiment_file, args, log_dir, job_ids, template=runs[0].engine.sbatch_script_template)
    return job_ids
def sequential(runs: Iterable[Run], n_runs: int, experiment: dict[str, Any]):
    """Execute the runs one after another in this process.

    A failing run is logged with its traceback and does not stop the others.
    Only RuntimeError is caught — detect_anomaly raises RuntimeError; other
    exception types propagate.
    """
    for i, run in enumerate(runs, start=1):
        log_info(f"Starting run {i}/{n_runs}.")
        export_experiment(run, experiment)
        try:
            run.start()
        except RuntimeError as _e:  # detect_anomaly raises RuntimeError
            t = traceback.format_exc()
            log_warn(f"Run {i}/{n_runs} failed with {t}.")

View file

@ -0,0 +1,181 @@
"""
The MIT License (MIT)
Copyright (c) 2015 Brent Pedersen - Bioinformatics
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Adapted from https://github.com/brentp/slurmpy
"""
from __future__ import print_function
import sys
import os
import subprocess
import tempfile
import atexit
import hashlib
import datetime
from typing import Optional, Sequence
TMPL = """\
#!/bin/bash
#SBATCH -e {log_dir}/{name}.%J.err
#SBATCH -o {log_dir}/{name}.%J.out
#SBATCH -J {name}
{header}
{bash_setup}
__script__"""
def tmp(suffix=".sh"):
    """Create a temporary file path that is deleted at interpreter exit.

    Uses tempfile.mkstemp instead of the deprecated, race-prone
    tempfile.mktemp: the file is created atomically here and later
    reopened for writing by the caller.
    """
    fd, t = tempfile.mkstemp(suffix=suffix)
    os.close(fd)  # only the path is needed; callers reopen with open(path, "w")
    atexit.register(os.unlink, t)
    return t
class Slurm(object):
    """Builds sbatch scripts from a template and submits them.

    Adapted from slurmpy; one instance corresponds to one job template
    (#SBATCH header built from `slurm_kwargs`)."""

    def __init__(self, name, slurm_kwargs=None, tmpl=None,
                 date_in_name=True, scripts_dir="slurm-scripts",
                 log_dir='logs', bash_strict=True):
        """Configure the submitter; the #SBATCH header is derived from slurm_kwargs."""
        if slurm_kwargs is None:
            slurm_kwargs = {}
        if tmpl is None:
            tmpl = TMPL
        self.log_dir = log_dir
        self.bash_strict = bash_strict
        header = []
        if 'time' not in slurm_kwargs.keys():
            slurm_kwargs['time'] = '84:00:00'  # default wall-time limit
        for k, v in slurm_kwargs.items():
            # long option names become '--key=value', single letters '-k value'
            if len(k) > 1:
                k = "--" + k + "="
            else:
                k = "-" + k + " "
            header.append(f"#SBATCH {k}{v}")
        # add bash setup list to collect bash script config
        bash_setup = []
        if bash_strict:
            bash_setup.append("set -eo pipefail -o nounset")
        self.header = "\n".join(header)
        self.bash_setup = "\n".join(bash_setup)
        # sanitize job name: keep only alphanumerics and dashes
        self.name = "".join(x for x in name.replace(
            " ", "-") if x.isalnum() or x == "-")
        self.tmpl = tmpl
        self.slurm_kwargs = slurm_kwargs
        if scripts_dir is not None:
            self.scripts_dir = os.path.abspath(scripts_dir)
        else:
            self.scripts_dir = None
        self.date_in_name = bool(date_in_name)

    def __str__(self):
        """The sbatch script text with the '__script__' placeholder still unfilled."""
        return self.tmpl.format(name=self.name, header=self.header,
                                log_dir=self.log_dir,
                                bash_setup=self.bash_setup)

    def _tmpfile(self):
        """Path for the generated script; a throwaway temp file when no
        scripts_dir is configured, otherwise '<scripts_dir>/<name>.sh'."""
        if self.scripts_dir is None:
            return tmp()
        else:
            # make sure both the scripts and log directories exist
            for _dir in [self.scripts_dir, self.log_dir]:
                if not os.path.exists(_dir):
                    os.makedirs(_dir)
            return f"{self.scripts_dir}/{self.name}.sh"

    def run(self,
            command: str,
            name_addition: Optional[str] = None,
            cmd_kwargs: Optional[dict[str, str]] = None,
            _cmd: str = "sbatch",
            tries: int = 1,
            depends_on: Optional[Sequence[int]] = None,
            dependency_type: str = "afterok"
            ) -> Optional[int]:
        """
        command: a bash command that you want to run
        name_addition: if not specified, the sha1 of the command to run
                       appended to job name. if it is "date", the yyyy-mm-dd
                       date will be added to the job name.
        cmd_kwargs: dict of extra arguments to fill in command
                   (so command itself can be a template).
        _cmd: submit command (change to "bash" for testing).
        tries: try to run a job either this many times or until the first
               success.
        depends_on: job ids that this depends on before it is run
        dependency_type: after, afterok, afterany, afternotok

        Returns the submitted job id, or None if sbatch did not report success.
        """
        if name_addition is None:
            name_addition = hashlib.sha1(command.encode("utf-8")).hexdigest()
        if self.date_in_name:
            name_addition += "-" + str(datetime.date.today())
        name_addition = name_addition.strip(" -")
        if cmd_kwargs is None:
            cmd_kwargs = {}
        # temporarily extend the job name; restored after each submission below
        n = self.name
        self.name = self.name.strip(" -")
        self.name += ("-" + name_addition.strip(" -"))
        args = []
        for k, v in cmd_kwargs.items():
            args.append(f"export {k}={v}")
        args = "\n".join(args)
        tmpl = str(self).replace("__script__", args + "\n###\n" + command)
        if depends_on is None or (len(depends_on) == 1 and depends_on[0] is None):
            depends_on = []
        with open(self._tmpfile(), "w", encoding="utf8") as sh:
            sh.write(tmpl)
        job_id = None
        for itry in range(1, tries + 1):
            args = [_cmd]
            if depends_on is not None and len(depends_on) > 0:
                dep = f"--dependency={dependency_type}:" + ":".join([str(x) for x in depends_on])
                args.append(dep)
            if itry > 1:
                # retries only fire if the previous attempt's job failed
                mid = f"--dependency=afternotok:{job_id}"
                args.append(mid)
            args.append(sh.name)
            res = subprocess.check_output(args).strip()
            print(res.decode(), file=sys.stderr)
            self.name = n
            if not res.startswith(b"Submitted batch"):
                return None
            j_id = int(res.split()[-1])
            if itry == 1:
                job_id = j_id
        return job_id
if __name__ == "__main__":
    # run this module's doctests when executed directly
    import doctest
    doctest.testmod()

View file

@ -0,0 +1,228 @@
import logging
from pathlib import Path
from typing import Any, Callable, Iterable, Optional, Type
import json
import math
import signal
import torch
from lightning_utilities.core.rank_zero import rank_zero_only, rank_zero_info, rank_zero_debug, log
def set_loglevel(level: str):
    """Set the 'lightning.pytorch' logger verbosity from a level name.

    Accepted names: debug, info, warn, error, silent. Unknown names are
    ignored (same as the original match statement without a default case).
    """
    level_map = {
        "debug": logging.DEBUG,
        "info": logging.INFO,
        "warn": logging.WARNING,
        "error": logging.ERROR,
        "silent": logging.CRITICAL,
    }
    pytorch_logger = logging.getLogger("lightning.pytorch")
    if level in level_map:
        pytorch_logger.setLevel(level_map[level])
@rank_zero_only
def rank_zero_print(*args: Any, **kwargs: Any):
    """print() that only executes on the rank-zero process in distributed runs."""
    return print(*args, **kwargs)
@rank_zero_only
def log_warn(msg: str, *args: Any, prefix: str = "[FOB WARNING] ", **kwargs: Any):
    """Log a prefixed warning, only on the rank-zero process.

    NOTE(review): extra positional *args are forwarded to logging's %-style
    formatting; pass a fully formatted msg unless it contains placeholders.
    """
    return log.warning(f"{prefix}{msg}", *args, **kwargs)
def log_info(msg: str, *args: Any, prefix: str = "[FOB INFO] ", **kwargs: Any):
    """Log a prefixed info message; rank_zero_info already restricts to rank zero."""
    return rank_zero_info(f"{prefix}{msg}", *args, **kwargs)
def log_debug(msg: str, *args: Any, prefix: str = "[FOB DEBUG] ", **kwargs: Any):
    """Log a prefixed debug message; rank_zero_debug already restricts to rank zero."""
    return rank_zero_debug(f"{prefix}{msg}", *args, **kwargs)
def write_results(results, filepath: Path):
    """Serialize `results` as indented JSON to `filepath` and announce it."""
    with open(filepath, "w", encoding="utf8") as out:
        json.dump(results, out, indent=4)
    print(f"Saved results into {filepath}.")
def wrap_list(x: Any) -> list[Any]:
    """Return x unchanged if it already is a list, otherwise wrap it in one."""
    return x if isinstance(x, list) else [x]
def calculate_steps(epochs: int, datapoints: int, devices: int, batch_size: int) -> int:
    """Total optimizer steps: ceil(datapoints / batch_size / devices) per epoch."""
    steps_per_epoch = math.ceil(datapoints / batch_size / devices)
    return steps_per_epoch * epochs
def some(*args, default):
    """
    returns the first argument that is not None or default.
    """
    # iterative equivalent of the former recursive scan
    for candidate in args:
        if candidate is not None:
            return candidate
    return default
def maybe_abspath(path: Optional[str | Path]) -> Optional[Path]:
if path is None:
return None
return Path(path).resolve()
def findfirst(f: Callable, xs: Iterable):
    """Return the first element of `xs` satisfying predicate `f`, else None."""
    return next((x for x in xs if f(x)), None)
def trainer_strategy(devices: int | list[int] | str) -> str:
if isinstance(devices, str):
return "auto"
ndevices = devices if isinstance(devices, int) else len(devices)
return "ddp" if ndevices > 1 else "auto"
def gpu_suited_for_compile() -> bool:
    """Whether the current CUDA GPU is a compute-capability generation
    (7.0 / 8.0 / 9.0, i.e. V100 / A100 / H100 class) suited for torch.compile.

    BUG FIX: previously returned an implicit None when CUDA was unavailable;
    now always returns a bool (False without CUDA).
    """
    if torch.cuda.is_available():
        device_cap = torch.cuda.get_device_capability()
        return device_cap in ((7, 0), (8, 0), (9, 0))
    return False
def precision_with_fallback(precision: str) -> str:
    """
    Check if cuda supports bf16, if not using cuda or if not available return 16 instead of bf16.

    Fix: the no-CUDA branch previously stripped the first two characters of ANY
    precision string (turning e.g. "16-mixed" into "-mixed"); the "bf" prefix is
    now only removed from bf16 variants, other precisions are returned unchanged.
    """
    if not torch.cuda.is_available():
        log_warn("Warning: No CUDA available. Results can be different!")
        # only bf16 variants need the fallback; plain "16"/"32" stay as-is
        return precision[2:] if precision.startswith("bf") else precision
    if precision.startswith("bf") and not torch.cuda.is_bf16_supported():
        log_warn("Warning: GPU does not support bfloat16. Results can be different!")
        return precision[2:]
    return precision
def str_to_seconds(s: str) -> int:
    """
    Parse a 'HH:MM:SS' string into a total number of seconds.

    Raises:
        ValueError: if *s* does not have exactly three ':'-separated parts.
          (Fix: the original used `assert`, which is stripped under `python -O`.)
    """
    parts = s.split(":")
    if len(parts) != 3:
        raise ValueError(f"Invalid time format: {s}. Use 'HH:MM:SS'.")
    return int(parts[0]) * 3600 + int(parts[1]) * 60 + int(parts[2])
def seconds_to_str(total_seconds: int, sep: str = ":") -> str:
    """Format a duration in seconds as zero-padded 'HH:MM:SS' (separator configurable)."""
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return sep.join(f"{part:02d}" for part in (hours, minutes, seconds))
def begin_timeout(delay=10, show_threads=False):
    """Schedule a SIGALRM in *delay* seconds, as a watchdog for hanging shutdowns.

    If *show_threads* is set, dump a stack trace of every live thread first
    (useful to see what is blocking). NOTE(review): this relies on the default
    SIGALRM disposition terminating the process — confirm no alarm handler is
    installed elsewhere.
    """
    if show_threads:
        import sys
        import traceback
        import threading
        # map thread ids to readable names for the dump
        thread_names = {t.ident: t.name for t in threading.enumerate()}
        for thread_id, frame in sys._current_frames().items():
            print(f"Thread {thread_names.get(thread_id, thread_id)}:")
            traceback.print_stack(frame)
            print()
    signal.alarm(delay)  # raise SIGALRM after `delay` seconds
def path_to_str_inside_dict(d: dict) -> dict:
    """Recursively replace every pathlib.Path value in *d* with its str form."""
    return convert_type_inside_dict(d, Path, str)
def convert_type_inside_dict(d: dict, src: Type, tgt: Type) -> dict:
    """Recursively convert every value of type *src* in *d* to *tgt* (nested dicts included)."""
    def _convert(value):
        # descend into nested dicts first, then apply the type conversion
        if isinstance(value, dict):
            value = convert_type_inside_dict(value, src, tgt)
        return tgt(value) if isinstance(value, src) else value

    return {key: _convert(value) for key, value in d.items()}
def dict_differences(custom: dict[str, Any], default: dict[str, Any]) -> dict[str, Any]:
    """
    Recursively returns a dictionary with the items in `custom` that are different or missing from `default`.
    Example:
    >>> dict_differences({"hi": 3, "bla": {"a": 2, "b": 2}}, {"hi": 2, "bla": {"a": 1, "b": 2}})
    {'hi': 3, 'bla': {'a': 2}}
    """
    diff: dict[str, Any] = {}
    for key, value in custom.items():
        if key not in default:
            diff[key] = value
            continue
        base = default[key]
        if base == value:
            continue  # identical entries are dropped
        if isinstance(value, dict) and isinstance(base, dict):
            diff[key] = dict_differences(value, base)
        else:
            diff[key] = value
    return diff
def concatenate_dict_keys(
        d: dict[str, Any],
        parent_key: str = "",
        sep: str = ".",
        exclude_keys: Iterable[str] = tuple()
    ) -> dict[str, Any]:
    """
    Example:
    >>> concatenate_dict_keys({ "A": { "B": { "C": 1, "D": 2 }, "E": { "F": 3 } } })
    {'A.B.C': 1, 'A.B.D': 2, 'A.E.F': 3}
    >>> concatenate_dict_keys({ "A": { "B": { "C": 1, "D": 2 }, "E": { "F": 3 } } }, exclude_keys=["B"])
    {'A.E.F': 3}
    """
    flattened: dict[str, Any] = {}
    for key, value in d.items():
        if key in exclude_keys:
            continue
        full_key = f"{parent_key}{sep}{key}" if parent_key else key
        if isinstance(value, dict):
            # recurse into nested dicts, carrying the accumulated key prefix
            flattened.update(concatenate_dict_keys(value, full_key, sep, exclude_keys))
        else:
            flattened[full_key] = value
    return flattened
def sort_dict_recursively(d: dict) -> dict:
    """Return a copy of *d* with keys sorted at every nesting level."""
    return {
        key: sort_dict_recursively(value) if isinstance(value, dict) else value
        for key, value in sorted(d.items())
    }
class EndlessList(list):
    """
    Returns first element if out of bounds. Otherwise same as list.
    """
    def __getitem__(self, index):
        # positive out-of-range indices wrap back to the first element
        wraps_around = index >= len(self) and len(self) > 0
        if wraps_around:
            index = 0
        return super().__getitem__(index)
class AttributeDict(dict):
    """A dict whose entries are also reachable via attribute access (d.key == d['key'])."""
    def __getattribute__(self, key: str) -> Any:
        try:
            # real attributes/methods (e.g. .keys) win over stored entries
            return super().__getattribute__(key)
        except AttributeError:
            # fall back to dictionary lookup; raises KeyError when absent
            return super().__getitem__(key)

View file

@ -0,0 +1,17 @@
from pathlib import Path
import argparse
from pytorch_fob.engine.engine import Engine
if __name__ == "__main__":
    # CLI entry point: plot the results of a finished experiment
    parser = argparse.ArgumentParser(description="Create a heatmap plot of benchmarking results.")
    parser.add_argument("settings", type=Path,
                        help="Path to the experiment yaml file.")
    args, extra_args = parser.parse_known_args()
    # force plotting on unless the user explicitly configured engine.plot
    plot_flag_given = any(arg.startswith("engine.plot") for arg in extra_args)
    if not plot_flag_given:
        extra_args = extra_args + ["engine.plot=true"]
    engine = Engine()
    engine.parse_experiment_from_file(args.settings, extra_args=extra_args)
    engine.plot()

View file

@ -0,0 +1,131 @@
# Evaluation
During training you can monitor your experiments with [Tensorboard](https://www.tensorflow.org/tensorboard).
We also try to provide some useful functionality to quickly evaluate and compare the results of your experiments.
One can use the ```evaluate_experiment.py``` to get a quick first impression of a finished experiment run.
## Plotting vs. raw data
You can use the plotting pipeline with your customized setting (as shown in the usage examples).
Alternatively you can use the script to export your data to a .csv and process the data to your own needs.
In this scenario, set ```evaluation.output_types: [csv] # no plotting, just the data``` in your experiment yaml.
## Usage Examples
In the following you can find 4 example use cases for experiments and how to visualize the results as heatmaps.
1. testing an optimizer on a task
2. comparing two optimizers on the same task
3. comparing multiple optimizers on different tasks
4. comparing the influence of a single hyperparameter
Here we want to focus on the plotting. For instructions on how to run experiments, refer to the main [README](../../README.md). To get started right away, we provide the data for this example. If you want to reproduce it, refer to [this section](#reproducing-the-data).
### Plotting the experiment
By default, calling the `run_experiment.py` will plot the experiment after training and testing. To disable, set `engine.plot=false`.
To plot your experiment afterwards, call the `evaluate_experiment.py` with the same experiment yaml. To adjust how to plot, change the values under the `evaluation` key of the experiment. Take a look at the [evaluation/default.yaml](default.yaml) to see which settings are available. Some of these keys are explained in the examples below to give the reader a first impression. Note that some default parameters are set in the respective tasks (e.g. in [tasks/mnist/default.yaml](../tasks/mnist/default.yaml)).
### Example use cases
Here are some example scenarios to give you an understanding of how our plotting works. Run the commands from the root of the repository. Take a look at the yaml files used in the command to see what is going on.
#### Example 1
This example is a good starting point; it shows the performance of a single default optimizer on one of the tasks.
Experiment file: [examples/plotting/1_mnist-adamw.yaml](../../examples/plotting/1_mnist-adamw.yaml)
```python -m pytorch_fob.evaluate_experiment examples/plotting/1_mnist-adamw.yaml```
![your plot is not finished yet](../../examples/plotting/1_mnist-adamw-last-heatmap.png)
This example uses only the final model performance and only creates the plot as png.
Helpful settings:
- ```checkpoints: [last]``` # you could use [last, best] to additionally plot the model with the best validation
- ```output_types: [png]``` # you could use [pdf, png] to also create a pdf
#### Example 2
You can compare two different optimizers.
Experiment file: [examples/plotting/2_adamw-vs-sgd.yaml](../../examples/plotting/2_adamw-vs-sgd.yaml)
```python -m pytorch_fob.evaluate_experiment examples/plotting/2_adamw-vs-sgd.yaml```
![your plot is not finished yet](../../examples/plotting/2_adamw-vs-sgd-last-heatmap.png)
Helpful settings:
- ```plot.x_axis: [optimizer.weight_decay, optimizer.kappa_init_param]``` # the values given here are used as the value for the axis. The order in the list is used from left to right for the plot columns
- `column_split_key: optimizer.name` This creates a column for each different optimizer (default behavior). You can set this to null to disable columns or choose a different key.
#### Example 3
There are multiple tasks in the benchmark, this example shows how to get a quick overview over multiple at the same time.
Experiment file: [examples/plotting/3_mnist-and-tabular_adamw-vs-sgd.yaml](../../examples/plotting/3_mnist-and-tabular_adamw-vs-sgd.yaml)
```python -m pytorch_fob.evaluate_experiment examples/plotting/3_mnist-and-tabular_adamw-vs-sgd.yaml```
![your plot is not finished yet](../../examples/plotting/3_mnist-and-tabular_adamw-vs-sgd-last-heatmap.png)
Helpful settings:
- ```split_groups: ["task.name"]```
Every non unique value for each parameter name in `split_groups` will create its own subplot.
Instead of a list you can set to `false` to disable splitting or `true` to split on every parameter that is different between runs (except those already in `column_split_key` or `aggregate_groups`).
This list is useful if there are just a few parameters you want to split.
#### Example 4
Any parameter that is neither on the x-axis nor y-axis will either be aggregated over or split into subplots.
Any individual square of a heatmap shows the *mean* and *std* over multiple runs (as seen in the previous plots). Here we show how to choose the runs to aggregate.
Experiment file: [examples/plotting/4_adamw-vs-sgd_seeds.yaml](../../examples/plotting/4_adamw-vs-sgd_seeds.yaml)
```python -m pytorch_fob.evaluate_experiment examples/plotting/4_adamw-vs-sgd_seeds.yaml```
![your plot is not finished yet](../../examples/plotting/4_adamw-vs-sgd_seeds-last-heatmap.png)
Helpful settings:
- Control the std with
- ```plot.std``` # toggle off with ```False```
- ```plot.aggfunc: std``` # also try ```var```
- control the rows with
- ```split_groups: ["engine.seed"]```
- ```aggregate_groups: []```
Per default the plot will display the *mean* and *std* calculated over the seeds.
We need to remove the seed from the ```aggregate_groups``` list (by giving an empty list instead). This list is useful if there are additional parameters you want to aggregate over.
-------------------------------------------------------------------------------
### Reproducing the Data
Let's create some data that we can plot; from the root directory, call:
#### Data Download
first we make sure the data is already downloaded beforehand:
```python -m pytorch_fob.dataset_setup examples/plotting/3_mnist-and-tabular_adamw-vs-sgd.yaml```
This will download the mnist data (required for 1-4) and tabular (required for 3) into the [examples/data](../../examples/data) directory - path can be changed in the corresponding yaml you want to use (e.g. [examples/plotting/1_mnist-adamw.yaml](../../examples/plotting/1_mnist-adamw.yaml) if you have already set up your benchmark).
Estimated disk usage for the data: ~65M
#### Training
The 2 tasks will be run on a 2x2 hyperparameter grid with 2 different seeds per optimizer, for a total of 32 runs.
```python -m pytorch_fob.run_experiment examples/plotting/3_mnist-and-tabular_adamw-vs-sgd.yaml```
After training finished you should find 32 run directories in [examples/plotting/outputs](../../examples/plotting/outputs)
All parameters that differ from the default value are noted in the directory name.

View file

@ -0,0 +1,5 @@
from pathlib import Path
def evaluation_path() -> Path:
    """Absolute path of the directory containing this module."""
    here = Path(__file__).resolve()
    return here.parent

View file

@ -0,0 +1,54 @@
evaluation:
data_dirs: null # List of Paths
output_dir: null # output filename is output_dir / experiment_name
experiment_name: null
split_groups: false # {True, False, [param.a, param.b, ...]} create additional plots where the data is grouped by the given parameter; True to detect all params with multiple unique values
aggregate_groups: # groups over which to aggregate values and compute mean/std. Default: [engine.seed]
- engine.seed
depth: 1 # the depth of the trial dirs relative to the given data_dirs
checkpoints: [best, last] # which model checkpoint to use
output_types: [pdf, png, csv] # choose all you want from {csv, pdf, png} and put it in brackets
verbose: False # debug prints
column_split_key: optimizer.name # if set, will split the dataframe and plot it in columns. Default: optimizer.name
column_split_order: null # sets the order in which the columns are plotted.
# keeping the values on null -> automatically figure it out if possible, or let matplotlib decide
plot:
x_axis: # indices on x axis (same order as order of subfigures given in data_dirs)
- optimizer.weight_decay
y_axis: # indices on y axis (same order as order of subfigures given in data_dirs)
- optimizer.learning_rate
metric: null # is automatically chosen from task name, this will overwrite it
limits: null # sets the limits for the colormap, 2 ints, order does not matter, leave empty for automatic
std: True # show std over aggregated values
aggfunc: std # for example {std, var, sem} which function to use to aggregate over the seeds; will only be used when 'std' is set to true
# format:
# string, how many digits to display, expects two values separated by a dot (e.g. "2.3")
# to make accuracy -> percent use a '2' in front of the dot
# to display 3 digits after the decimal point, write a '3' behind the dot
format: null # for example {"2.0", "2.1", "2.3", "0.2", ...}
single_file: true # if true, save all heatmaps in one file. 'split_groups' are represented as rows.
plotstyle:
tight_layout: True
text:
usetex: True # you can give latex code in the yaml: $\sqrt{\pi \cdot \sigma}$ but some clusters don't have it installed
# general font
font:
family: "serif" # matplotlib {serif, sans-serif, cursive, fantasy, monospace}
size: 14
# the font in the tiles of the matrix
matrix_font:
size: 12
scale: 1.0 # scales *figsize* argument by this value, useful for ".png"
color_palette: "rocket"
dpi: 300
# the name of the files storing the hyperparameters of the experiments and the scores
experiment_files:
best_model: results_best_model.json
last_model: results_final_model.json
config: config.yaml

View file

@ -0,0 +1,30 @@
# pretty names for the plot
names:
# optimizer
adamw_baseline: AdamW
sgd_baseline: SGD
adamcpr: AdamCPR
adamcpr_fast: AdamCPR
sgd_stepwise: SGD (stepwise)
# metric
test_acc: Test Accuracy
test_loss: Test Loss
test_mIoU: Test mean Intersection over Union
test_mAcc: Test mean Accuracy
test_rmse: Test Root Mean Square Error (RMSE)
test_rocauc: Test ROC-AUC
# parameter
learning_rate: Learning Rate
weight_decay: Weight Decay
kappa_init_param: Kappa Init Param
# tasks
classification: classification
classification_small: classification_small
detection: detection
graph: graph
graph_tiny: graph_tiny
mnist: mnist
segmentation: segmentation
tabular: tabular
template: template
translation: translation

View file

@ -0,0 +1,564 @@
import json
from pathlib import Path
from os import PathLike
from typing import List, Literal
from itertools import repeat
import matplotlib.pyplot as plt
from matplotlib.figure import Figure
import seaborn as sns
import pandas as pd
from pytorch_fob.engine.parser import YAMLParser
from pytorch_fob.engine.utils import AttributeDict, convert_type_inside_dict, log_warn, log_info, log_debug
from pytorch_fob.evaluation import evaluation_path
def get_available_trials(dirname: Path, config: AttributeDict, depth: int = 1):
    """finds the path for all trials in the *dirname* directory

    Descends exactly `depth` directory levels below `dirname` and returns
    every directory at that level containing the experiment config file.
    Fix: removed an unreachable alternative traversal branch that was guarded
    by a constant flag (it also mutated the list while iterating it).
    """
    assert isinstance(dirname, Path)
    # RECURSIVELY FIND ALL DIRS IN DIRNAME (up to depth)
    subdirs: list[Path] = [dirname]
    for _ in range(depth):
        # replace the current level with all of its child directories
        next_level: list[Path] = []
        for subdir in subdirs:
            next_level += [x for x in subdir.iterdir() if x.is_dir()]
        subdirs = next_level
    format_str = "\n "  # f-string expression part cannot include a backslash
    log_debug(f"found the following directories:{format_str}{format_str.join(str(i) for i in subdirs)}.")

    def is_trial(path: Path):
        # a directory counts as a trial iff it contains the experiment config file
        return any(x.name == config.experiment_files.config for x in path.iterdir())

    # reversed before filtering — TODO(review): confirm the ordering is intentional
    # (it only affects log and plot ordering)
    subdirs = list(filter(is_trial, subdirs[::-1]))
    log_debug(f"We assume the following to be trials:{format_str}{format_str.join(str(i) for i in subdirs)}.")
    return subdirs
def dataframe_from_trials(trial_dir_paths: List[Path], config: AttributeDict) -> pd.DataFrame:
    """takes result from get_available_trials and packs them in a dataframe,
    does not filter duplicate hyperparameter settings.

    For each trial directory the experiment config yaml is flattened into one
    dataframe row (pd.json_normalize) and the plotted metric value is read from
    the trial's result json. Trials with missing files are skipped with a warning.

    Raises:
        ValueError: if 'evaluation.plot.metric' is unset, or no trial yielded data.
    """
    dfs: List[pd.DataFrame] = []
    for path in trial_dir_paths:
        config_file = path / config.experiment_files.config
        # NOTE(review): 'last_instead_of_best' is read here, but the default yaml
        # only defines 'checkpoints' — confirm where this flag gets injected.
        if config.last_instead_of_best:
            result_file = path / config.experiment_files.last_model
        else:
            result_file = path / config.experiment_files.best_model
        all_files_exist = all([
            config_file.is_file(),
            result_file.is_file()
        ])
        if not all_files_exist:
            log_warn(f"WARNING: one or more files are missing in {path}. Skipping this hyperparameter setting." +
                     f" <{config_file}>: {config_file.is_file()} and\n <{result_file}>: {result_file.is_file()})")
            continue
        yaml_parser = YAMLParser()
        yaml_content = yaml_parser.parse_yaml(config_file)
        # convert the sub dicts first, then the dict itself
        yaml_content = convert_type_inside_dict(yaml_content, src=dict, tgt=AttributeDict)
        yaml_content = AttributeDict(yaml_content)
        # use user given value
        metric_of_value_to_plot = config.plot.metric
        # there is no automatic fallback here: a missing metric is a hard error
        if not metric_of_value_to_plot:
            raise ValueError("evaluation.plot.metric is not set")
        data = pd.json_normalize(yaml_content)
        with open(result_file, "r", encoding="utf8") as f:
            content = json.load(f)
        # the results json is a list of dicts; only the first entry is inspected
        if metric_of_value_to_plot in content[0]:
            data.at[0, metric_of_value_to_plot] = content[0][metric_of_value_to_plot]
        else:
            log_warn(f"could not find value for {metric_of_value_to_plot} in json")
        dfs.append(data)
    if len(dfs) == 0:
        raise ValueError("no dataframes found, check your config")
    df = pd.concat(dfs, sort=False)
    return df
def create_matrix_plot(dataframe: pd.DataFrame, config: AttributeDict, cols: str, idx: str, ax=None,
                       cbar: bool = True, vmin: None | int = None, vmax: None | int = None):
    """
    Creates one heatmap and puts it into the grid of subplots.
    Uses pd.pivot_table() and sns.heatmap().

    Args:
        dataframe: trials belonging to this grid cell (already grouped/filtered).
        config: the 'evaluation' section of the experiment config.
        cols: dataframe column for the heatmap x-axis.
        idx: dataframe column for the heatmap y-axis.
        ax: matplotlib Axes to draw into.
        cbar: whether to draw the colorbar for this heatmap.
        vmin, vmax: shared colormap limits (may be overridden by config limits).
    """
    df_entry = dataframe.iloc[0]  # arbitrary trial; evaluation.* values are identical per dataframe
    metric_name = df_entry["evaluation.plot.metric"]
    # CLEANING LAZY USER INPUT
    # cols are x-axis, idx are y-axis; auto-prefix "optimizer." when missing
    if cols not in dataframe.columns:
        log_warn("x-axis value not present in the dataframe; did you forget to add a 'optimizer.' as a prefix?\n" +
                 f" using '{'optimizer.' + cols}' as 'x-axis' instead.")
        cols = "optimizer." + cols
    if idx not in dataframe.columns:
        log_warn("y-axis value not present in the dataframe; did you forget to add a 'optimizer.' as a prefix?\n" +
                 f" using '{'optimizer.' + idx}' as 'y-axis' instead.")
        idx = "optimizer." + idx
    # create pivot table and format the score result
    pivot_table = pd.pivot_table(dataframe,
                                 columns=cols, index=idx, values=metric_name,
                                 aggfunc='mean')
    fmt = None
    format_string = dataframe["evaluation.plot.format"].iloc[0]
    # scaling the values given by the user to fit his format needs (-> and adapting the limits)
    # NOTE(review): if 'evaluation.plot.format' is null this .split raises an
    # AttributeError — confirm a concrete default (e.g. "2.1") is always set upstream
    value_exp_factor, decimal_points = format_string.split(".")
    value_exp_factor = int(value_exp_factor)
    decimal_points = int(decimal_points)
    if vmin:
        vmin *= (10 ** value_exp_factor)
    if vmax:
        vmax *= (10 ** value_exp_factor)
    pivot_table = (pivot_table * (10 ** value_exp_factor)).round(decimal_points)
    fmt=f".{decimal_points}f"
    # up to here limits was the min and max over all dataframes,
    # usually we want to use user values
    if "evaluation.plot.limits" in dataframe.columns:
        limits = dataframe["evaluation.plot.limits"].iloc[0]
        if limits:
            vmin = min(limits)
            vmax = max(limits)
            log_debug(f"setting cbar limits to {vmin}, {vmax} ")
    colormap_name = config.plotstyle.color_palette
    low_is_better = dataframe["evaluation.plot.test_metric_mode"].iloc[0] == "min"
    if low_is_better:
        colormap_name += "_r"  # this will "invert" / "flip" the colorbar
    colormap = sns.color_palette(colormap_name, as_cmap=True)
    metric_legend = pretty_name(metric_name)
    # FINETUNE POSITION
    #                        left  bottom  width  height
    # cbar_ax = fig.add_axes([0.92, 0.235, 0.02, 0.6])
    cbar_ax = None
    if not config.plot.std:
        return sns.heatmap(pivot_table, ax=ax, cbar_ax=cbar_ax,
                           annot=True, fmt=fmt,
                           annot_kws={'fontsize': config.plotstyle.matrix_font.size},
                           cbar=cbar, vmin=vmin, vmax=vmax, cmap=colormap, cbar_kws={'label': f"{metric_legend}"})
    else:
        # BUILD STD TABLE
        # aggregate with the configured spread function (std/var/sem); missing
        # combinations become inf so they can be detected and skipped below
        pivot_table_std = pd.pivot_table(dataframe,
                                         columns=cols, index=idx, values=metric_name,
                                         aggfunc=config.plot.aggfunc, fill_value=float("inf"), dropna=False
                                         )
        if float("inf") in pivot_table_std.values.flatten():
            log_warn("WARNING: Not enough data to calculate the std, skipping std in plot")
        pivot_table_std = (pivot_table_std * (10 ** value_exp_factor)).round(decimal_points)
        # annotation strings: "mean\n±(std)" per heatmap cell
        annot_matrix = pivot_table.copy().astype("string")
        for i in pivot_table.index:
            for j in pivot_table.columns:
                mean = pivot_table.loc[i, j]
                std = pivot_table_std.loc[i, j]
                std_string = f"\n±({round(std, decimal_points)})" if std != float("inf") else ""  # type: ignore
                annot_matrix.loc[i, j] = f"{round(mean, decimal_points)}{std_string}"  # type: ignore
        fmt = ""  # cannot format like before, as we do not only have a number
        return sns.heatmap(pivot_table, ax=ax, cbar_ax=cbar_ax,
                           annot=annot_matrix, fmt=fmt,
                           annot_kws={'fontsize': config.plotstyle.matrix_font.size},
                           cbar=cbar, vmin=vmin, vmax=vmax, cmap=colormap, cbar_kws={'label': f"{metric_legend}"})
def get_all_num_rows_and_their_names(dataframe_list: list[pd.DataFrame], config):
    """For every dataframe, compute how many heatmap rows it needs plus the row labels."""
    n_rows: list[int] = []
    row_names: list[list[str]] = []
    for i, df in enumerate(dataframe_list):
        # columns that never create extra rows: the plot axes, the metric
        # columns, and anything explicitly ignored or aggregated over
        ignored = [config.plot.x_axis[i], config.plot.y_axis[0]]
        ignored += list(df["evaluation.plot.metric"].unique())
        ignored += config.get("ignore_keys", [])
        ignored += config.get("aggregate_groups", [])
        row_count, names = get_num_rows(df, ignored, config)
        n_rows.append(row_count)
        if not names:  # will be empty if we have only one row
            names.append("default")
        row_names.append(names)
    return n_rows, row_names
def get_num_rows(dataframe: pd.DataFrame, ignored_cols: list[str], config: AttributeDict
                 ) -> tuple[int, list[str]]:
    """each matrix has 2 params (one for x and y each), one value, and we aggregate over seeds;
    if there are more than these 4 parameters with different values,
    we want to put that in separate rows instead of aggregating over them.
    returning: the number of rows (at least 1) and the names of the cols
    (row labels of the form "param=value", one per unique value)"""
    necesarry_rows = 0
    # the user might specify a value for the groups that we should split on in <split_groups>
    whitelisted_cols: list[str] | Literal["all"] = "all"  # everything is whitelisted if this value stays 'all'
    if isinstance(config.split_groups, list):
        whitelisted_cols = config.split_groups[:]
    elif config.split_groups is False:
        whitelisted_cols = []
    columns_with_non_unique_values = []
    for col in dataframe.columns:
        is_eval_key = col.startswith("evaluation.")
        is_ignored = col in ignored_cols
        is_whitelisted = whitelisted_cols == "all" or col in whitelisted_cols
        if any([is_ignored, is_eval_key, not is_whitelisted]):
            # a whitelisted column landing here must have been ignored/eval-key,
            # i.e. it is most likely listed in two conflicting groups
            if is_whitelisted:
                log_warn(f"{col} is in the whitelist, but will be ignored. Probably {col} is in both 'split_groups' and 'aggregate_groups'.")
            log_debug(f"ignoring {col}")
            continue
        # NaN counts as its own value so missing params still split into rows
        nunique = dataframe[col].nunique(dropna=False)
        if nunique > 1:
            log_debug(f"adding {col} since there are {nunique} unique values")
            for unique_hp in dataframe[col].unique():
                columns_with_non_unique_values.append(f"{col}={unique_hp}")
            necesarry_rows += (nunique)  # each unique parameter should be an individual plot
    rows_number = max(necesarry_rows, 1)
    col_names = columns_with_non_unique_values
    log_debug(f"{rows_number=}")
    log_debug(f"{col_names=}")
    return rows_number, col_names
def find_global_vmin_vmax(dataframe_list, config):
    """Scan all dataframes for shared colorbar limits; (None, None) for a single column."""
    vmin: int | float | None = None
    vmax: int | float | None = None
    num_cols = len(dataframe_list)
    if num_cols <= 1:
        # a single subplot can let matplotlib pick its own limits
        return vmin, vmax
    # all subplots should have same colors -> we need to find the limits
    vmin, vmax = float('inf'), float('-inf')
    for i, dataframe in enumerate(dataframe_list):
        cols = config.plot.x_axis[i]
        idx = config.plot.y_axis[0]
        key = config.plot.metric
        pivot = pd.pivot_table(dataframe,
                               columns=cols, index=idx,
                               values=key,
                               aggfunc='mean')
        current_min = pivot.min().min()
        current_max = pivot.max().max()
        log_debug("colorbar_limits:\n" +
                  f" subfigure number {i+1}, checking for metric {key}: \n" +
                  f" min value is {current_min},\n" +
                  f" max value is {current_max}")
        vmin = min(vmin, current_min)
        vmax = max(vmax, current_max)
    return vmin, vmax
def create_figure(dataframe_list: list[pd.DataFrame], config: AttributeDict):
    """
    Takes a list of dataframes. Each dataframe is processed into a column of heatmaps.

    Returns:
        (fig, axs): the matplotlib Figure and the grid of Axes, normalized so
        it is always indexable as axs[row][column].
    """
    num_cols: int = len(dataframe_list)
    # calculate the number of rows for each dataframe
    n_rows, row_names = get_all_num_rows_and_their_names(dataframe_list, config)
    # Handling of the number of rows in the plot
    # we could either create a full rectangular grid, or allow each subplot to nest subplots
    # for nesting we would need to create subfigures instead of subplots i think
    if config.split_groups is False:
        n_rows_max = 1
        row_names = [["default"] for _ in range(num_cols)]
    else:
        n_rows_max = max(n_rows)
    log_debug(f"{n_rows=} and {num_cols=}")
    # TODO, figsize was just hardcoded for (1, 2) grid and left to default for (1, 1) grid
    # probably not worth the hassle to create something dynamic (at least not now)
    # EDIT: it was slightly adapted to allow num rows without being completely unreadable
    # margin = (num_subfigures - 1) * 0.3
    # figsize=(5*n_cols + margin, 2.5)
    scale = config.plotstyle.scale
    if num_cols == 1 and n_rows_max > 1:
        figsize = (2**3 * scale, 2 * 3 * n_rows_max * scale)
    elif num_cols == 2:
        # TODO: after removing cbar from left subfigure, it is squished
        # there is an argument to share the legend, we should use that
        figsize = (12 * scale, 5.4 * n_rows_max * scale)
    elif num_cols > 2:
        figsize = (12 * (num_cols / 2) * scale, 5.4 * n_rows_max * scale)
    else:
        # single heatmap: let matplotlib pick its default size
        figsize = None
    # TODO: use seaborn FacetGrid
    fig, axs = plt.subplots(n_rows_max, num_cols, figsize=figsize)
    # normalize axs to a rows x cols structure regardless of the grid shape
    if n_rows_max == 1:
        axs = [axs]
    if num_cols == 1:
        axs = [[ax] for ax in axs]  # adapt for special case so we have unified types
    # Adjust left and right margins as needed
    # fig.subplots_adjust(left=0.1, right=0.9, top=0.97, hspace=0.38, bottom=0.05,wspace=0.3)
    # None -> plt will choose vmin and vmax
    vmin, vmax = find_global_vmin_vmax(dataframe_list, config)
    for i in range(num_cols):
        num_nested_subfigures: int = n_rows[i]
        if not config.split_groups:
            create_one_grid_element(dataframe_list, config, axs, i,
                                    j=0,
                                    max_i=num_cols,
                                    max_j=0,
                                    vmin=vmin,
                                    vmax=vmax,
                                    n_rows=n_rows,
                                    row_names=row_names)
        else:
            for j in range(num_nested_subfigures):
                create_one_grid_element(dataframe_list, config, axs, i,
                                        j,
                                        max_i=num_cols,
                                        max_j=num_nested_subfigures,
                                        vmin=vmin,
                                        vmax=vmax,
                                        n_rows=n_rows,
                                        row_names=row_names)
    if config.plotstyle.tight_layout:
        fig.tight_layout()
    # SUPTITLE (the super title on top of the whole figure in the middle)
    # # TODO super title might be squished when used together with tight layout (removing for now)
    # if n_rows_max > 1 or num_cols > 1:
    #     # set experiment name as title when multiple matrices in image
    #     if config.experiment_name:
    #         fig.suptitle(config.experiment_name)
    return fig, axs
def create_one_grid_element(dataframe_list: list[pd.DataFrame], config: AttributeDict, axs,
                            i: int, j: int, max_i: int, max_j: int, vmin, vmax, n_rows, row_names):
    """does one 'axs' element as it is called in plt

    Draws the heatmap for column *i*, row *j* of the grid, including axis
    labels (outer edges only) and the subplot title.
    Returns False when the required group is missing from the data, else None.
    """
    num_nested_subfigures: int = n_rows[i]
    name_for_additional_subplots: list[str] = row_names[i]
    num_subfigures = max_i  # from left to right
    # NOTE(review): this immediately overwrites the n_rows[i] value assigned
    # above (that first assignment is dead) — confirm max_j is the intended count
    num_nested_subfigures = max_j  # from top to bottom
    dataframe = dataframe_list[i]
    cols = config.plot.x_axis[i]
    idx = config.plot.y_axis[0]
    # only include colorbar once
    include_cbar: bool = i == num_subfigures - 1
    model_param = name_for_additional_subplots[j]
    if model_param == "default":
        current_dataframe = dataframe  # we do not need to do further grouping
    else:
        # row label looks like "param.name=value" -> filter the dataframe to that group
        param_name, param_value = model_param.split("=", maxsplit=1)
        if pd.api.types.is_numeric_dtype(dataframe[param_name]):
            param_value = float(param_value)
        try:
            current_dataframe = dataframe.groupby([param_name]).get_group((param_value,))
        except KeyError:
            log_warn(f"WARNING: was not able to groupby '{param_name}'," +
                     "maybe the data was created with different versions of fob; skipping this row")
            log_debug(f"{param_name=}{param_value=}{dataframe.columns=}{dataframe[param_name]=}")
            return False
    current_plot = create_matrix_plot(current_dataframe, config,
                                      cols, idx,
                                      ax=axs[j][i],
                                      cbar=include_cbar, vmin=vmin, vmax=vmax)
    # LABELS
    # Pretty name for label "learning_rate" => "Learning Rate"
    # remove x_label of all but last row, remove y_label for all but first column
    if i > 0:
        current_plot.set_ylabel('', labelpad=8)
    else:
        current_plot.set_ylabel(pretty_name(current_plot.get_ylabel()))
    if j < num_nested_subfigures - 1:
        current_plot.set_xlabel('', labelpad=8)
    else:
        current_plot.set_xlabel(pretty_name(current_plot.get_xlabel()))
    # reading optimizer and task name after grouping
    df_entry = current_dataframe.iloc[0]  # just get an arbitrary trial
    opti_name = df_entry['optimizer.name']
    task_name = df_entry['task.name']
    # TITLE
    # title (heading) of the heatmap: <optimname> on <taskname> (+ additional info)
    title = f"{pretty_name(opti_name)} on {pretty_name(task_name)}"
    if max_i > 1 or max_j > 1:
        title += "" if model_param == "default" else f"\n{model_param}"
    current_plot.set_title(title)
def extract_dataframes(workload_paths: List[Path], config: AttributeDict, depth: int = 1
                       ) -> list[pd.DataFrame]:
    """Load one dataframe per workload directory (one future plot column each)."""
    return [
        dataframe_from_trials(get_available_trials(path, config, depth), config)
        for path in workload_paths
    ]
def get_output_file_path(dataframe_list: list[pd.DataFrame], config: AttributeDict, suffix: str = "") -> Path:
    """Build '<output_dir>/<experiment_name>[-suffix]'; defaults are derived from the data."""
    # unique, sorted task/optimizer names joined with "_" form the default file name
    task_name = "_".join(sorted({df.iloc[0]["task.name"] for df in dataframe_list}))
    optim_name = "_".join(sorted({df.iloc[0]["optimizer.name"] for df in dataframe_list}))
    here = Path(__file__).parent.resolve()
    output_dir = Path(config.output_dir) if config.output_dir else here
    experiment_name = Path(config.experiment_name) if config.experiment_name else f"{optim_name}-{task_name}"
    base = output_dir / experiment_name
    return Path(f"{base}-{suffix}") if suffix else Path(base)
def set_plotstyle(config: AttributeDict):
    """Apply the configured matplotlib rcParams (latex rendering, font family/size)."""
    plt.rcParams.update({
        "text.usetex": config.plotstyle.text.usetex,
        "font.family": config.plotstyle.font.family,
        "font.size": config.plotstyle.font.size,
    })
# module-level cache: label file path -> parsed 'names' mapping
_PRETTY_NAMES_CACHE: dict[str, dict] = {}


def pretty_name(name: str, pretty_names: dict | str = {}) -> str:  # type: ignore pylint: disable=dangerous-default-value
    """
    Tries to use a mapping for the name, else will do some general replacement.
    mapping can be a dictionary or a filename of a yaml file with 'names' key.

    Fix: the original claimed to cache the parsed yaml but only rebound the
    local parameter, so the labels file was re-read on every call; the parsed
    mapping is now cached per label file in _PRETTY_NAMES_CACHE.
    """
    # resolve which label file applies (default labels.yaml next to this module)
    label_file: Path = evaluation_path() / "labels.yaml"
    if isinstance(pretty_names, str):
        label_file = Path(pretty_names)
    if pretty_names == {} or isinstance(pretty_names, str):
        cache_key = str(label_file)
        if cache_key not in _PRETTY_NAMES_CACHE:
            yaml_parser = YAMLParser()
            yaml_content = yaml_parser.parse_yaml(label_file)
            _PRETTY_NAMES_CACHE[cache_key] = yaml_content["names"]
        pretty_names = _PRETTY_NAMES_CACHE[cache_key]
    # applying pretty names: exact key first, then the last dotted component,
    # otherwise a generic "snake_case" -> "Title Case" transformation
    name_without_yaml_prefix = name.split(".")[-1]
    if name in pretty_names:
        return pretty_names[name]
    if name_without_yaml_prefix in pretty_names:
        return pretty_names[name_without_yaml_prefix]
    return name.replace('_', ' ').title()
def save_csv(dfs: list[pd.DataFrame], output_filename: Path):
    """Dump each dataframe as '<output_filename>-<i>.csv' (raw, unaggregated data)."""
    base = output_filename.resolve()
    for index, frame in enumerate(dfs):
        target = f"{base}-{index}.csv"
        log_info(f"saving raw data as {target}")
        frame.to_csv(path_or_buf=target, index=False)
def save_plot(fig: Figure, output_file_path: Path, file_type: str, dpi: int):
    """Save the figure as '<output_file_path>.<file_type>' at the given dpi."""
    target = f"{output_file_path.resolve()}.{file_type}"
    log_info(f"saving figure as <{target}>")
    fig.savefig(target, dpi=dpi)
def save_files(fig, dfs: list[pd.DataFrame], output_file_path: Path, config: AttributeDict):
    """
    Write all requested output files (config.output_types) next to each other.

    Creates the parent directory if needed. 'csv' dumps the raw dataframes,
    'png'/'pdf' save the figure; anything else is reported instead of being
    silently ignored.
    """
    output_file_path.parent.mkdir(parents=True, exist_ok=True)
    for file_type in config.output_types:
        if file_type == "csv":
            save_csv(dfs, output_file_path)
        elif file_type in ("png", "pdf"):
            save_plot(fig, output_file_path, file_type, config.plotstyle.dpi)
        else:
            # previously unknown types were dropped without any feedback
            log_warn(f"unknown output type '{file_type}', skipping")
def clean_config(config: AttributeDict) -> AttributeDict:
    """
    Normalize the user-supplied config so later code can rely on its shape.

    Descends into the 'evaluation' section (keeping the full config reachable
    as config.all_values), validates 'data_dirs', and coerces scalar values
    into lists for output_types, data_dirs, plot.x_axis and plot.y_axis.
    The axis lists are padded with their first entry to match len(data_dirs).
    Returns the (possibly replaced) config.
    """
    if "evaluation" in config.keys():
        evaluation_config: AttributeDict = config.evaluation
        # keep the complete original config reachable from the sub-config
        evaluation_config["all_values"] = config
        config = evaluation_config
    else:
        log_warn("there is no 'evaluation' in the yaml provided!")
    if "data_dirs" in config.keys():
        value_is_none = not config.data_dirs
        value_has_wrong_type = not isinstance(config.data_dirs, (PathLike, str, list))
        if value_is_none or value_has_wrong_type:
            raise ValueError(f"Error: 'evaluation.data_dirs' was not provided correctly! check for typos in the yaml provided! value given: {config.data_dirs}")
    # allow the user to write a single string instead of a list of strings
    if not isinstance(config.output_types, list):
        config["output_types"] = [config.output_types]
        log_info("fixing value for key <config.output_types> to be a list[str]")
    if not isinstance(config.data_dirs, list):
        config["data_dirs"] = [Path(config.data_dirs)]
        log_info("fixing value for key <config.data_dirs> to be a list[Path]")
    # x_axis
    if not isinstance(config.plot.x_axis, list):
        config["plot"]["x_axis"] = [config.plot.x_axis]
        log_info("fixing value for key <config.plot.x_axis> to be a list[str]")
    if len(config.plot.x_axis) < len(config.data_dirs):
        # use same x axis for all if only one given
        missing_elements = len(config.data_dirs) - len(config.plot.x_axis)
        config["plot"]["x_axis"] += repeat(config.plot.x_axis[0], missing_elements)
    # y_axis
    if not isinstance(config.plot.y_axis, list):
        config["plot"]["y_axis"] = [config.plot.y_axis]
        log_info("fixing value for key <config.plot.y_axis> to be a list[str]")
    if len(config.plot.y_axis) < len(config.data_dirs):
        # use same y axis for all if only one given
        missing_elements = len(config.data_dirs) - len(config.plot.y_axis)
        config["plot"]["y_axis"] += repeat(config.plot.y_axis[0], missing_elements)
    return config
def main(config: AttributeDict):
    """Evaluation entry point: normalize the config, load the experiment
    data, render the figure, and write all requested output files."""
    config = clean_config(config)  # sets config to config.evaluation, cleans some data
    workloads: List[Path] = [Path(directory) for directory in config.data_dirs]
    log_debug(f"{workloads}=")
    set_plotstyle(config)
    frames = extract_dataframes(workloads, depth=config.depth, config=config)
    figure, _ = create_figure(frames, config)
    destination = get_output_file_path(frames, config)
    save_files(figure, frames, destination, config)

View file

@ -0,0 +1,37 @@
# Optimizers
We currently have the following optimizers:
| Name | Optimizer | LR Scheduler |
| ---- | --------- | ------------ |
| adamw_baseline | [AdamW](https://arxiv.org/abs/1711.05101) | [Cosine Annealing](https://arxiv.org/abs/1608.03983) with linear warmup |
| adamcpr | [AdamCPR](https://arxiv.org/abs/2311.09058v2) | [Cosine Annealing](https://arxiv.org/abs/1608.03983) with linear warmup |
| sgd_baseline | Stochastic Gradient Descent | [Cosine Annealing](https://arxiv.org/abs/1608.03983) |
| sgd_stepwise | [Stochastic Gradient Descent](https://pytorch.org/docs/stable/generated/torch.optim.SGD.html#torch.optim.SGD) | [StepLR](https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.StepLR.html) |
| adafactor | [Adafactor](https://arxiv.org/abs/1804.04235) | Constant |
## Creating your own optimizer
To add your own optimizer, you need to create a subfolder in the `optimizers` directory. The name of that folder will be the name used to invoke the optimizer. Within the folder you need to provide two files: `optimizer.py` and `default.yaml`. There is a `template` optimizer with useful comments, which can be used as a starting point.
### optimizer.py
Here you need to implement a function `configure_optimizers` with the following signature:
```python
configure_optimizers(model: GroupedModel, config: OptimizerConfig) -> OptimizerLRScheduler
```
- The return type is the same as described [here](https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.core.LightningModule.html#lightning.pytorch.core.LightningModule.configure_optimizers).
- The `GroupedModel` is a wrapper around a `torch.nn.Module`. It additionally provides a method `grouped_parameters`, which returns the model parameters grouped by their `weight_decay` and `learning_rate` settings. This is useful for some tasks that want to use e.g. lower learning rates for different parts of the model or to avoid applying weight decay to your norm layers. The underlying `torch.nn.Module` can be accessed with `model.model`.
- The `OptimizerConfig` has the `lr_interval, max_steps, max_epochs` attributes. It also contains all attributes provided in the `optimizer` section of the `experiment.yaml`.
### default.yaml
Here you can provide default values for all the hyperparameters your optimizer needs. These values will be added to the `OptimizerConfig` passed to the `configure_optimizers`. So if you have the following `default.yaml`:
```yaml
optimizer:
name: my_awesome_optimizer
output_dir_name: my_awesome_optimizer
learning_rate: 1.e-3
important:
extra:
parameter: 42
```
you could use `config.important.extra.parameter` in the `configure_optimizers` function.

View file

@ -0,0 +1,4 @@
from .optimizers import optimizer_names, optimizer_path, Optimizer
from .lr_schedulers import lr_schedulers_path
__all__ = ["Optimizer", "optimizer_names", "optimizer_path", "lr_schedulers_path"]

View file

@ -0,0 +1,27 @@
import importlib
from pathlib import Path
from lightning.pytorch.utilities.types import OptimizerLRScheduler
from pytorch_fob.engine.parameter_groups import GroupedModel
from pytorch_fob.engine.configs import OptimizerConfig
def import_optimizer(name: str):
    """Dynamically load the `optimizer` module of the named optimizer package."""
    module_path = f"pytorch_fob.optimizers.{name}.optimizer"
    return importlib.import_module(module_path)
def optimizer_path(name: str) -> Path:
    """Absolute path of the directory containing the named optimizer."""
    package_dir = Path(__file__).resolve().parent
    return package_dir / name
def optimizer_names() -> list[str]:
    """Names of all available optimizers (subdirectories of this package)."""
    excluded = {"__pycache__", "lr_schedulers"}
    package_dir = Path(__file__).parent
    return [entry.name for entry in package_dir.iterdir()
            if entry.is_dir() and entry.name not in excluded]
class Optimizer():
    """Thin wrapper that defers optimizer construction to the configured
    optimizer package's `configure_optimizers` function."""

    def __init__(self, config: OptimizerConfig) -> None:
        self.config = config

    def configure_optimizers(self, model: GroupedModel) -> OptimizerLRScheduler:
        """Delegate to `pytorch_fob.optimizers.<name>.optimizer.configure_optimizers`."""
        module = import_optimizer(self.config.name)
        return module.configure_optimizers(model, self.config)

View file

@ -0,0 +1,25 @@
from pathlib import Path
import argparse
from pytorch_fob.engine.engine import Engine
from pytorch_fob.engine.utils import set_loglevel
def main(args: argparse.Namespace, extra_args: list[str]):
    """Run the experiment described by args.experiment_file and plot results.

    extra_args are forwarded to the engine as config overrides.
    """
    experiment = Engine()
    experiment.parse_experiment_from_file(args.experiment_file, extra_args=extra_args)
    experiment.run_experiment()
    experiment.plot()
if __name__ == "__main__":
    # CLI entry point: parse the known flags, forward anything else to the
    # engine as config overrides.
    arg_parser = argparse.ArgumentParser(
        description="runs an experiment specified by a file"
    )
    arg_parser.add_argument("experiment_file", type=Path,
                            help="The yaml file specifying the experiment.")
    arg_parser.add_argument("--log_level", type=str,
                            choices=["debug", "info", "warn", "silent"],
                            default="info",
                            help="Set the log level")
    args, extra_args = arg_parser.parse_known_args()
    set_loglevel(args.log_level)
    main(args, extra_args)

View file

@ -0,0 +1,98 @@
# Tasks
We provide a set of tasks to train and evaluate models. A task consists of a model and a dataset.
Each task has their own `README.md` file with more details.
We currently have the following tasks:
### Ready to use
| Name | Dataset | Model | Task | Target Metric | Baseline Score | Baseline Runtime | Hardware |
| ------- | ---- | ----- | ---- | ------------- | -------------- | ---------------- | -------- |
| [mnist](mnist) | MNIST | MLP | Image Classification | Top-1 Accuracy | 0.97 | 1 min | 1 gpu |
| [classification](classification) | [Imagenet-64x64](https://patrykchrabaszcz.github.io/Imagenet32/) | [Wide ResNet](https://arxiv.org/pdf/1605.07146.pdf) | Image Classification | Top-1 Accuracy | 0.69 | 4h | 4 gpu |
| [classification_small](classification_small) | [CIFAR100](https://www.cs.toronto.edu/~kriz/cifar.html) | [Resnet18](https://arxiv.org/pdf/1512.03385.pdf) | Image Classification | Top-1 Accuracy | 0.77 | 10 min | 1 gpu |
| [segmentation](segmentation) | [MIT Scene Parse](http://sceneparsing.csail.mit.edu/) | [SegFormer](https://arxiv.org/abs/2105.15203) | Semantic Segmentation | Intersection over Union (IoU) | 0.35 | 5h | 4 gpu |
| [graph](graph) | [ogbg-molhiv](https://ogb.stanford.edu/docs/graphprop/#ogbg-mol) | [Graph Isomorphism Network (GIN)](https://arxiv.org/pdf/1810.00826.pdf) | Graph Property Prediction | ROC-AUC | 0.77 | 20min | 1 gpu |
| [graph_tiny](graph_tiny) | [Cora](https://paperswithcode.com/sota/node-classification-on-cora) | [GCN](https://arxiv.org/abs/1609.02907) | Node Classification | Accuracy | 0.82 | 1min | 1 gpu |
| [tabular](tabular) | [California Housing](https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html) | [FT Transformer](https://arxiv.org/pdf/2106.11959.pdf) | Tabular Regression | Test RMSE | 0.40 | 2 min | 1 gpu |
| [translation](translation) | [WMT17(en-de)](https://machinetranslate.org/wmt17) | [T5 small](https://jmlr.org/papers/volume21/20-074/20-074.pdf) | Machine Translation | BLEU (sacrebleu) | 26.3 | 6h | 4 gpus |
### Under Development
| Name | Dataset | Model | Task | Target Metric | Baseline Score | Baseline Runtime | Hardware |
| ------- | ----- | ----- | ---- | ------------- | -------------- | ---------------- | -------- |
| [detection](pytorch_fob/tasks/detection) | [COCO](https://cocodataset.org) | [Faster R-CNN](https://arxiv.org/abs/1506.01497) with [MobileNet v3](https://arxiv.org/abs/1905.02244) backbone | Object detection | Average Precision (IoU) | ? | ~4h | 4 gpus |
| rna_folding | bpRNA | RNAformer | RNA secondary structure prediction | F1 | ? | ~4h | 4 gpus |
## Adding your own task
To add your own task, you need to create a subfolder in the `tasks` directory. The name of that folder will be the name used to invoke the task. Within the folder you need to provide the following files: `task.py`, `model.py`, `data.py`, `default.yaml` and `README.md`.
There is a [template](template) task with useful comments, which can be used as a starting point.
### data.py
Here you provide the code for interacting with your dataset. As we use [lightning](https://lightning.ai/docs/pytorch/stable/), you will need to create a [LightningDataModule](https://lightning.ai/docs/pytorch/stable/data/datamodule.html).
The class you create must inherit from `TaskDataModule` which in turn inherits from `LightningDataModule`. The base `TaskDataModule` already defines some default methods for the dataloader methods, so if you do not need any custom dataloaders you can probably leave them.
The two methods you need to implement are [prepare_data](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#prepare-data) and [setup](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#setup). In `prepare_data` you need to put your downloading and data preprocessing logic. In `setup` you should load and split your dataset and set the `self.data_train, self.data_val, self.data_test` attributes in the appropriate stages.
### model.py
Here you provide the code for the model. As we use [lightning](https://lightning.ai/docs/pytorch/stable/), you will need to create a [LightningModule](https://lightning.ai/docs/pytorch/stable/common/lightning_module.html).
The class you create must inherit from `TaskModel` which in turn inherits from `LightningModule`. The `__init__` method should have the following signature:
```python
def __init__(self, optimizer: Optimizer, config: TaskConfig):
```
In the `__init__` method you need to create your model, and pass it to the `super().__init__` call. There the model is wrapped into a `GroupedModel` which splits the model parameters into weight_decay and non-weight_decay groups. If you want to specify your own parameter groups (e.g. for different learning rates) you need to wrap your model in a `GroupedModel` yourself, before passing it to the `super().__init__` call.
The other methods you need to implement are [training_step](https://lightning.ai/docs/pytorch/stable/common/lightning_module.html#training-step), [validation_step](https://lightning.ai/docs/pytorch/stable/common/lightning_module.html#validation-step) and [test_step](https://lightning.ai/docs/pytorch/stable/common/lightning_module.html#test-step). Here you need to implement the training and evaluation logic.
### task.py
Here you only need to provide two simple functions:
```python
def get_datamodule(config: TaskConfig) -> TaskDataModule
```
which returns an instance of your `DataModule` class, and
```python
def get_task(optimizer: Optimizer, config: TaskConfig) -> tuple[TaskModel, TaskDataModule]
```
which returns an instance of your `TaskModel` class and an instance of your `DataModule` class.
### default.yaml
Here you can provide default values for all the hyperparameters your task needs. All keys under the `task` section will be added to the `TaskConfig`.
There are some required parameters you need to specify:
```yaml
task:
name: my_awesome_task # same as directory name
batch_size: 123
max_epochs: 42
max_steps: null # should be left null, use max_epochs instead
target_metric: val_acc # choose a metric that is being logged in your LightningModule
target_metric_mode: min # min or max
engine:
devices: 1 # number of devices to use
sbatch_args:
time: 00:05:00 # estimated time to train
evaluation:
plot:
metric: test_acc
test_metric_mode: min
format: "2.1"
limits: [0, 100] # colorbar limits
optimizer:
name: adamw_baseline # set the default optimizer
```
You can optionally set and override optimizer defaults, e.g.:
```yaml
optimizer:
name: adamw_baseline
learning_rate: 0.1
```
would use a default learning rate of 0.1 instead of the one specified in the `default.yaml` of the optimizer. Note that this applies to all optimizers. So if the user chooses a different optimizer, they will still get the default learning rate specified here.
### README.md
Here you should provide a short description of your task, and a baseline performance. Follow the template as seen in the existing tasks.

View file

@ -0,0 +1 @@
from .tasks import task_names, task_path, import_task, TaskModel, TaskDataModule

View file

@ -0,0 +1,119 @@
import importlib
import time
from typing import Any, Callable, Optional
from pathlib import Path
from lightning import LightningModule, LightningDataModule
from lightning.pytorch.core.optimizer import LightningOptimizer
from lightning.pytorch.utilities.types import OptimizerLRScheduler
import torch
from torch import nn
from torch.utils.data import DataLoader
from pytorch_fob.optimizers import Optimizer
from pytorch_fob.engine.configs import TaskConfig
from pytorch_fob.engine.parameter_groups import GroupedModel
def import_task(name: str):
    """Dynamically load the `task` module of the named task package."""
    module_path = f"pytorch_fob.tasks.{name}.task"
    return importlib.import_module(module_path)
def task_path(name: str) -> Path:
    """Absolute path of the directory containing the named task."""
    package_dir = Path(__file__).resolve().parent
    return package_dir / name
def task_names() -> list[str]:
    """Names of all available tasks (subdirectories of this package)."""
    excluded = {"__pycache__"}
    package_dir = Path(__file__).parent
    return [entry.name for entry in package_dir.iterdir()
            if entry.is_dir() and entry.name not in excluded]
class TaskModel(LightningModule):
    """Base LightningModule shared by all tasks.

    Wraps the model into weight-decay parameter groups (unless the task
    already provides a GroupedModel) and delegates optimizer construction
    to the configured Optimizer. Also records the wall-clock duration of
    every optimizer step.
    """

    def __init__(
            self,
            model: nn.Module | GroupedModel,
            optimizer: Optimizer,
            config: TaskConfig,
            **kwargs: Any
    ) -> None:
        super().__init__(**kwargs)
        self.config = config
        self.optimizer = optimizer
        # ensure the model is always grouped (weight-decay / no-weight-decay)
        if isinstance(model, GroupedModel):
            self.model = model
        else:
            self.model = GroupedModel(model)
        # wall-clock duration of each optimizer.step call, in milliseconds
        self.optimizer_times_ms = []

    def forward(self, *args, **kwargs):
        return self.model.forward(*args, **kwargs)

    def configure_optimizers(self) -> OptimizerLRScheduler:
        return self.optimizer.configure_optimizers(self.model)

    def optimizer_step(
            self,
            epoch: int,
            batch_idx: int,
            optimizer: torch.optim.Optimizer | LightningOptimizer,
            optimizer_closure: Optional[Callable[[], Any]] = None,
    ) -> None:
        """Time the optimizer step with ns precision and record it in ms."""
        step_start = time.time_ns()
        optimizer.step(closure=optimizer_closure)  # type: ignore
        step_end = time.time_ns()
        self.optimizer_times_ms.append((step_end - step_start) / 1e6)
class TaskDataModule(LightningDataModule):
    """Base LightningDataModule shared by all tasks.

    Subclasses implement `prepare_data` and `setup` and must assign
    `data_train`, `data_val`, `data_test` (and optionally `data_predict`)
    in the appropriate stages.
    """

    def __init__(self, config: TaskConfig) -> None:
        super().__init__()
        self.config = config
        # cap dataloader workers to avoid oversubscription on large machines
        self.workers: int = min(config.workers, 16)
        self.data_dir: Path = config.data_dir / config.name
        self.batch_size: int = config.batch_size
        # declared here, assigned by subclasses in setup()
        self.data_train: Any
        self.data_val: Any
        self.data_test: Any
        self.data_predict: Any
        self.collate_fn = None

    def check_dataset(self, data):
        """Make sure that all tasks have correctly configured their data sets"""
        if not data:
            raise NotImplementedError("Each task has its own data set")
        if not self.batch_size or self.batch_size < 1:
            # fixed: message previously contained a backslash continuation that
            # embedded a run of indentation spaces, plus the typo 'explicitely'
            raise NotImplementedError(
                "Each task configures its own batch_size. "
                "Please set it explicitly, to avoid confusion."
            )

    def _dataloader(self, data, shuffle: bool = False) -> DataLoader:
        """Shared DataLoader construction for all stages (checks data first)."""
        self.check_dataset(data)
        return DataLoader(
            data,
            shuffle=shuffle,
            batch_size=self.batch_size,
            num_workers=self.workers,
            collate_fn=self.collate_fn
        )

    def train_dataloader(self):
        return self._dataloader(self.data_train, shuffle=True)

    def val_dataloader(self):
        return self._dataloader(self.data_val)

    def test_dataloader(self):
        return self._dataloader(self.data_test)

    def predict_dataloader(self):
        return self._dataloader(self.data_predict)