mirror of
https://github.com/lilakk/BLEUBERI.git
synced 2026-04-19 12:58:12 +00:00
22 lines
416 B
Python
22 lines
416 B
Python
from dataset import KeywordDataset
|
|
|
|
def create_dataset(
    data_path,
    split="train",
    cache_dir="",
    streaming=False,
    shuffle=False,
    load_from_disk=False,
    tokenizer=None,
):
    """Build a ``KeywordDataset`` and trigger its loading step.

    The dataset object is constructed from ``data_path`` and ``tokenizer``;
    every other argument is forwarded unchanged to
    ``KeywordDataset.load_dataset``.

    Args:
        data_path: First constructor argument of ``KeywordDataset``.
        split: Forwarded as the first positional argument of
            ``load_dataset``.
        cache_dir: Forwarded as the second positional argument of
            ``load_dataset``.
        streaming: Forwarded to ``load_dataset`` by keyword.
        shuffle: Forwarded to ``load_dataset`` by keyword.
        load_from_disk: Forwarded to ``load_dataset`` by keyword.
        tokenizer: Second constructor argument of ``KeywordDataset``.

    Returns:
        The ``KeywordDataset`` instance after ``load_dataset`` has been
        called on it (the return value of ``load_dataset`` is discarded).
    """
    dataset = KeywordDataset(data_path, tokenizer)

    # Keyword-only options, gathered once so the call below stays compact.
    load_options = {
        "streaming": streaming,
        "shuffle": shuffle,
        "load_from_disk": load_from_disk,
    }
    # ``split`` and ``cache_dir`` are passed positionally, as before.
    dataset.load_dataset(split, cache_dir, **load_options)

    return dataset
|