mirror of
https://github.com/InternLM/InternBootcamp.git
synced 2026-04-19 12:58:04 +00:00
init-commit
This commit is contained in:
commit
18a552597a
3461 changed files with 1150579 additions and 0 deletions
18
examples/xpuyu_usage/bootcamp_rl/datasets/__init__.py
Executable file
18
examples/xpuyu_usage/bootcamp_rl/datasets/__init__.py
Executable file
|
|
@ -0,0 +1,18 @@
|
|||
# Copyright (c) InternLM. All rights reserved.
|
||||
from .prompt import bootcampPromptDataset, PromptCollator, InfiniteDataLoaderIter
|
||||
from .trajectory import (
|
||||
InferDataset,
|
||||
TrajectoryCollator,
|
||||
TrajectoryDataset,
|
||||
TrajectoryDatasetWithFilter,
|
||||
)
|
||||
|
||||
# Public names re-exported by this package.
__all__ = [
    # from .prompt
    "bootcampPromptDataset",
    "PromptCollator",
    # from .trajectory
    "InferDataset",
    "TrajectoryDataset",
    "TrajectoryDatasetWithFilter",
    "TrajectoryCollator",
    # from .prompt
    "InfiniteDataLoaderIter",
]
|
||||
214
examples/xpuyu_usage/bootcamp_rl/datasets/prompt.py
Executable file
214
examples/xpuyu_usage/bootcamp_rl/datasets/prompt.py
Executable file
|
|
@ -0,0 +1,214 @@
|
|||
# Copyright (c) InternLM. All rights reserved.
|
||||
import json
|
||||
|
||||
import time
|
||||
import numpy as np
|
||||
import torch
|
||||
from datasets import load_dataset
|
||||
from torch.nn.utils.rnn import pad_sequence
|
||||
from torch.utils.data import Dataset
|
||||
from xtuner._lite import get_logger
|
||||
|
||||
logger = get_logger()
|
||||
|
||||
|
||||
def load_hf_datasets(repo, split="train"):
    """Load a HuggingFace dataset and convert each row to the bootcamp prompt schema.

    Each converted sample carries ``pass_rate``, a single-turn ``message_data``
    built from the row's ``question``, and a ``metadata`` dict holding the
    ``gold_answer`` plus the data source tag used for judger routing.
    """
    converted_ds = [
        {
            "pass_rate": row["pass_rate"],
            "message_data": [{"role": "user", "content": row["question"]}],
            "metadata": {
                "data_source": "math",  # for the router to know which judger to use
                "gold_answer": row["gold_answer"],
            },
        }
        for row in load_dataset(repo, split=split)
    ]
    logger.info(f"Loaded {len(converted_ds)} samples from {repo}")
    return converted_ds
|
||||
|
||||
|
||||
def load_jsonl_datasets(file_path):
    """Load prompt samples from a JSONL file.

    ``file_path`` may carry an optional subsample ratio suffix separated by
    ``::`` (e.g. ``"data.jsonl::0.5"`` keeps a random half of the samples).

    Rows that already contain ``message_data`` are kept as-is; otherwise the
    row is converted to the bootcamp prompt schema (pass_rate / message_data /
    metadata with the judger-routing ``data_source`` tag).
    """
    subsample_ratio = 1.0
    if "::" in file_path:
        file_path, subsample_ratio = file_path.split("::")
        subsample_ratio = float(subsample_ratio)

    datasets = []
    # Stream the file line by line instead of materializing it with
    # readlines(); skip blank lines, which would crash json.loads.
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            sample = json.loads(line)
            if "message_data" not in sample:
                sample = {
                    "pass_rate": sample["pass_rate"],
                    "message_data": [{"role": "user", "content": sample["question"]}],
                    "metadata": {
                        "data_source": "math",  # for the router to know which judger to use
                        "gold_answer": sample["gold_answer"],
                    },
                }
            datasets.append(sample)

    if subsample_ratio < 1.0:
        # Use a local Generator so we do not clobber the process-global
        # NumPy RNG state (np.random.seed) for unrelated code in this process.
        rng = np.random.default_rng(0)
        datasets = rng.choice(
            datasets, int(len(datasets) * subsample_ratio), replace=False
        ).tolist()

    logger.info(f"Loaded {len(datasets)} samples from {file_path}")
    return datasets
|
||||
|
||||
|
||||
def balance_difficulty_with_cfg(dataset, difficulty_balance_cfg):
    """Re-weight samples by pass-rate bucket.

    ``difficulty_balance_cfg`` is an iterable of ``((low, high), repeat)``
    entries; a sample whose ``pass_rate`` satisfies ``low <= rate < high``
    is repeated ``repeat`` times (first matching bucket wins).  Samples that
    match no bucket are dropped.

    NOTE(review): repeats are the *same* dict object, not copies — mutating
    one repeated sample downstream would affect all of its copies; confirm
    that is intended.
    """
    balanced_dataset = []
    for sample in dataset:
        rate = sample["pass_rate"]
        for (low, high), repeat in difficulty_balance_cfg:
            if low <= rate < high:
                balanced_dataset += [sample] * repeat
                break
    logger.info(
        f"After difficulty balancing, the dataset size is {len(balanced_dataset)}"
    )
    return balanced_dataset
|
||||
|
||||
|
||||
class bootcampPromptDataset(Dataset):
    """Prompt dataset loaded from JSONL files and/or HuggingFace repos.

    Args:
        path: one path or a list of paths; entries ending in ``.jsonl`` are
            read as JSONL, anything else is treated as a HF dataset repo.
        tokenizer: tokenizer providing ``apply_chat_template``.
        difficulty_balance_cfg: optional bucket/repeat config forwarded to
            ``balance_difficulty_with_cfg``.
    """

    def __init__(self, path, tokenizer, difficulty_balance_cfg=None):
        if isinstance(path, str):
            path = [path]
        dataset = []
        for p in path:
            if p.endswith(".jsonl"):
                dataset.extend(load_jsonl_datasets(p))
            else:
                dataset.extend(load_hf_datasets(p))
        if difficulty_balance_cfg:
            dataset = balance_difficulty_with_cfg(dataset, difficulty_balance_cfg)
        self.dataset = dataset
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        # Shallow-copy the stored sample: the original wrote input_ids /
        # labels / num_tokens back into self.dataset[idx] on every access,
        # silently mutating the cached dataset.
        sample = dict(self.dataset[idx])
        input_ids = self.tokenizer.apply_chat_template(
            sample["message_data"], add_generation_prompt=True
        )
        sample["input_ids"] = input_ids
        sample["labels"] = input_ids
        sample["num_tokens"] = len(input_ids)
        return sample
|
||||
|
||||
|
||||
class PromptCollator:
    """Collate prompt samples into a padded (or packed) tensor batch.

    Args:
        pad_token_id: padding value for ``input_ids``.
        ignore_id: padding value for ``labels``.
        pack_batch: when True and the batch has more than one sample, all
            sequences are concatenated into one row instead of being padded.
    """

    def __init__(self, pad_token_id=0, ignore_id=-100, pack_batch=False):
        self.pack_batch = pack_batch
        self.pad_token_id = pad_token_id
        self.ignore_id = ignore_id

    def __call__(self, instances):
        # Flatten one level: an element may itself be a list of samples.
        flat = []
        for item in instances:
            if isinstance(item, list):
                flat.extend(item)
            else:
                flat.append(item)
        instances = flat

        ids_list = [torch.LongTensor(d["input_ids"]) for d in instances]
        labels_list = [torch.LongTensor(d["labels"]) for d in instances]
        metadatas = [d["metadata"] for d in instances]
        message_datas = [d["message_data"] for d in instances]

        # num_tokens may be a single int or an iterable of ints per sample.
        counts = []
        for d in instances:
            n = d["num_tokens"]
            if isinstance(n, int):
                counts.append(n)
            else:
                counts.extend(n)
        num_tokens = torch.IntTensor(counts)

        attention_mask = [torch.ones_like(t) for t in ids_list]

        if len(instances) > 1 and self.pack_batch:
            # One packed row containing every sequence back to back.
            input_ids = torch.cat(ids_list, dim=0).unsqueeze(0)
            labels = torch.cat(labels_list, dim=0).unsqueeze(0)
            attention_mask = torch.cat(attention_mask, dim=0).unsqueeze(0)
        elif len(instances) > 1:
            # Right-pad every sequence to the batch maximum.
            input_ids = pad_sequence(
                ids_list, batch_first=True, padding_value=self.pad_token_id
            )
            labels = pad_sequence(
                labels_list, batch_first=True, padding_value=self.ignore_id
            )
            attention_mask = pad_sequence(
                attention_mask, batch_first=True, padding_value=0
            )
        else:
            input_ids = torch.stack(ids_list)
            labels = torch.stack(labels_list)
            attention_mask = torch.stack(attention_mask)

        if input_ids.shape != labels.shape:
            logger.error(f"[instances] {instances}")
            logger.error(f"[num_tokens] {num_tokens}")
            logger.error(f"[input_ids] {input_ids}")
            logger.error(f"[labels] {labels}")
            raise RuntimeError(
                "The shape of input_ids and labels must be "
                f"equal, but found {input_ids.shape} and "
                f"{labels.shape}."
            )

        return {
            "input_ids": input_ids,
            "labels": labels,
            "num_tokens": num_tokens,
            "attention_mask": attention_mask.bool(),
            "metadata": metadatas,
            "message_data": message_datas,
        }
|
||||
|
||||
class InfiniteDataLoaderIter:
    """Iterator that never exhausts: restarts the dataloader on StopIteration.

    On every epoch rollover the sampler's ``set_epoch`` is called when
    available so shuffling differs across epochs.
    """

    def __init__(self, dataloader):
        self.dataloader = dataloader
        self.iterator = iter(dataloader)
        self._epoch = 0

    def __iter__(self):
        return self

    def __next__(self):
        try:
            return next(self.iterator)
        except StopIteration:
            logger.info(f"Dataloader epoch {self._epoch} finished. Start a new epoch.")
            self._epoch += 1
            sampler = getattr(self.dataloader, 'sampler', None)
            if sampler is not None and hasattr(sampler, 'set_epoch'):
                # In case the` _SingleProcessDataLoaderIter` has no sampler,
                # or data loader uses `SequentialSampler` in Pytorch.
                sampler.set_epoch(self._epoch)
            time.sleep(2)  # Prevent possible deadlock during epoch transition
            self.iterator = iter(self.dataloader)
            return next(self.iterator)
|
||||
166
examples/xpuyu_usage/bootcamp_rl/datasets/trajectory.py
Executable file
166
examples/xpuyu_usage/bootcamp_rl/datasets/trajectory.py
Executable file
|
|
@ -0,0 +1,166 @@
|
|||
# Copyright (c) InternLM. All rights reserved.
|
||||
import json
|
||||
import random
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from xtuner._lite import get_logger
|
||||
from xtuner._lite.algorithms.sft.dataset import SftCollator
|
||||
|
||||
logger = get_logger()
|
||||
|
||||
|
||||
class InferDataset(torch.utils.data.Dataset):
    """Joins prompt token ids with generated response ids into full sequences.

    All four constructor arguments are parallel lists indexed per sample.
    """

    def __init__(self, prompts_input_ids, responses_ids, message_data, metadata):
        super().__init__()

        assert (
            len(prompts_input_ids)
            == len(responses_ids)
            == len(message_data)
            == len(metadata)
        ), f"The length of prompts_input_ids, responses_ids, message_data, metadata should be the same, but got {len(prompts_input_ids)}, {len(responses_ids)}, {len(message_data)}, {len(metadata)}"
        self.prompts_input_ids = prompts_input_ids
        self.responses_ids = responses_ids
        self.message_data = message_data
        self.metadata = metadata

    def __len__(self):
        return len(self.prompts_input_ids)

    def __getitem__(self, item):
        prompt_ids = self.prompts_input_ids[item]
        response_ids = self.responses_ids[item]

        sequence = prompt_ids + response_ids
        # Labels sit one position to the left of their tokens: the prompt
        # (minus its last slot) is masked with -100, and the final slot is
        # masked as well, so only the response tokens are supervised.
        labels = [-100] * (len(prompt_ids) - 1) + response_ids + [-100]

        return {
            "input_ids": sequence,
            "labels": labels,
            "num_tokens": len(sequence),
            "message_data": self.message_data[item],
            "metadata": self.metadata[item],
        }
|
||||
|
||||
|
||||
class TrajectoryDataset(torch.utils.data.Dataset):
    """Holds the current batch of rollout trajectories plus token statistics.

    ``update`` replaces the stored trajectories and recomputes how many
    tokens are supervised (label >= 0, "action" tokens) versus total.
    """

    def __init__(self):
        super().__init__()
        self._num_action_tokens = 0
        self._num_total_tokens = 0
        self._trajectories = []

    @property
    def num_action_tokens(self):
        # Stored as a plain Python int (see update()).  The original called
        # ``.item()`` here, which raised AttributeError whenever the property
        # was read before the first update() (the initial value is an int).
        return self._num_action_tokens

    @property
    def num_total_tokens(self):
        return self._num_total_tokens

    def update(self, trajectories):
        """Replace stored trajectories and refresh the token counters."""
        num_total_tokens = 0
        num_action_tokens = 0
        for data in trajectories:
            labels = np.array(data["labels"])
            num_total_tokens += labels.size
            # label >= 0 marks supervised tokens; ignore positions are -100.
            # Cast to int so the counter stays a plain Python int.
            num_action_tokens += int((labels >= 0).sum())

        self._num_action_tokens = num_action_tokens
        self._num_total_tokens = num_total_tokens

        self._trajectories = trajectories

    def dump_jsonl(self, path, tokenizer, debug=False):
        """Write one JSON object per trajectory to *path*.

        Uses the precomputed ``sequence_text`` when present, otherwise
        decodes ``input_ids`` with *tokenizer*.  ``debug=True`` additionally
        dumps the raw ids/labels.
        """
        with open(path, "w", encoding="utf8") as f:
            for data in self._trajectories:
                json_line = {
                    "sequence": (
                        data["sequence_text"]
                        if "sequence_text" in data
                        else tokenizer.decode(data["input_ids"])
                    ),
                    "num_tokens": data["num_tokens"],
                }
                json_line["judger_reward"] = data["judger_reward"]
                json_line["judger_advantage"] = data["judger_advantage"]

                if debug:
                    # Raw ids/labels are bulky; only emit them on request.
                    json_line["input_ids"] = data["input_ids"]
                    json_line["labels"] = data["labels"]

                json_str = json.dumps(json_line, ensure_ascii=False)
                f.write(json_str + "\n")

    def dump_log(self, path, tokenizer, debug=False):
        """Write a human-readable log of every trajectory to *path*."""
        with open(path, "w", encoding="utf8") as f:
            for data in self._trajectories:
                log_string = f"[sequence]:\n{data['sequence_text'] if 'sequence_text' in data else tokenizer.decode(data['input_ids'])}\n\n"
                log_string += f"[num_tokens]: {data['num_tokens']}\n"
                log_string += f"[judger_reward]: {data['judger_reward']}\n"
                log_string += f"[judger_advantage]: {data['judger_advantage']}\n"
                f.write(log_string + "\n\n=======================\n")

    def __len__(self):
        return len(self._trajectories)

    def __getitem__(self, item):
        return self._trajectories[item]


class TrajectoryDatasetWithFilter(TrajectoryDataset):
    """TrajectoryDataset that drops all-correct / all-wrong rollout groups.

    Args:
        repeat_k: number of consecutive rollouts per prompt; the incoming
            trajectory list is chunked into groups of this size.
        only_keep_1_pair: when True, keep at most one correct and one
            incorrect trajectory per group; otherwise keep the whole group.
    """

    def __init__(self, repeat_k=1, only_keep_1_pair=True):
        super().__init__()
        self.repeat_k = repeat_k
        self.only_keep_1_pair = only_keep_1_pair

    def update(self, trajectories):
        # split trajectories into k groups: (a, a, b, b, c, c) -> [(a, a), (b, b), (c, c)]
        groups = [
            trajectories[i : i + self.repeat_k]
            for i in range(0, len(trajectories), self.repeat_k)
        ]
        keeped_trajectories = []
        for group in groups:
            correctness = [1 if data["judger_reward"] == 1 else 0 for data in group]
            correct = [data for data in group if data["judger_reward"] == 1]
            incorrect = [data for data in group if data["judger_reward"] != 1]
            pass_rate = sum(correctness) / len(correctness)
            # Groups with uniform outcome carry no learning signal here; skip.
            if pass_rate == 1 or pass_rate == 0:
                continue
            if self.only_keep_1_pair:
                # max keep 1 correct and 1 incorrect
                correct = random.choice(correct)
                incorrect = random.choice(incorrect)
                correct["pass_rate"] = pass_rate
                incorrect["pass_rate"] = pass_rate
                keeped_trajectories.append(correct)
                keeped_trajectories.append(incorrect)
            else:
                for data in group:
                    data["pass_rate"] = pass_rate
                    keeped_trajectories.append(data)

        super().update(keeped_trajectories)
|
||||
|
||||
|
||||
class TrajectoryCollator(SftCollator):
    """SftCollator that also carries judger rewards/advantages per instance.

    ``pass_rate`` is forwarded as well when the first instance has it.
    """

    def __call__(self, instances):
        batch = super().__call__(instances)
        batch["judger_rewards"] = [ins["judger_reward"] for ins in instances]
        batch["judger_advantages"] = [ins["judger_advantage"] for ins in instances]
        if "pass_rate" in instances[0]:
            batch["pass_rate"] = [ins["pass_rate"] for ins in instances]
        return batch
|
||||
Loading…
Add table
Add a link
Reference in a new issue