mirror of
https://github.com/InternLM/InternBootcamp.git
synced 2026-04-27 17:23:17 +00:00
init-commit
This commit is contained in:
commit
18a552597a
3461 changed files with 1150579 additions and 0 deletions
4
internbootcamp/bootcamp_utils/__init__.py
Executable file
4
internbootcamp/bootcamp_utils/__init__.py
Executable file
|
|
@ -0,0 +1,4 @@
|
|||
from .catch_print import *
|
||||
from .sampler import sample_sentences_by_percentage_distribution
|
||||
from .deduplicator import *
|
||||
from .jsonlines import *
|
||||
22
internbootcamp/bootcamp_utils/catch_print.py
Executable file
22
internbootcamp/bootcamp_utils/catch_print.py
Executable file
|
|
@ -0,0 +1,22 @@
|
|||
from contextlib import redirect_stdout
|
||||
import io
|
||||
|
||||
|
||||
def catch_print(func, *args, **kwargs):
    """Call *func* while capturing everything it prints to stdout.

    Generalized (backward-compatibly) to forward positional arguments as
    well: existing keyword-only callers are unaffected.

    Args:
        func: the callable to invoke.
        *args: positional arguments forwarded to ``func``.
        **kwargs: keyword arguments forwarded to ``func``.

    Returns:
        tuple: ``(captured_stdout, return_value_of_func)``.
    """
    output_capture = io.StringIO()
    with redirect_stdout(output_capture):
        result = func(*args, **kwargs)
    output = output_capture.getvalue()
    output_capture.close()
    return output, result
|
||||
|
||||
if __name__ == '__main__':
    # Quick demo: capture the output of a function that both prints
    # and returns a value.
    def greet(name):
        print(f"Hello, {name}!")
        return 42

    captured, value = catch_print(greet, name="Alice")
    print("Captured Output:", captured)  # -> Captured Output: Hello, Alice!
    print("Function Result:", value)  # -> Function Result: 42
|
||||
169
internbootcamp/bootcamp_utils/cipher_data_translator.py
Executable file
169
internbootcamp/bootcamp_utils/cipher_data_translator.py
Executable file
|
|
@ -0,0 +1,169 @@
|
|||
import os
|
||||
import random
|
||||
|
||||
from .jsonlines import read_jsonl, write_jsonl
|
||||
|
||||
|
||||
def to_ICL_encode(path, final_path):
    """Build in-context-learning (ICL) *encode* samples from one cipher jsonl.

    Each input record becomes one output record whose prompt holds 1-5
    randomly drawn few-shot examples plus a final query; the result is
    written to ``final_path/icl_encode_<basename>``.
    """
    datas = read_jsonl(path)
    path = os.path.basename(path)  # reuse the variable for the output file name
    output_path = os.path.join(final_path,f'icl_encode_{path}')
    results = []
    for data in datas:
        result = {
            "cipher_name": data['cipher_source'],
            "prompt": "",
            "input": data['decode_text'],
            "extra_args": data['extra_args'],
            "output": "",
            "ground_truth": data['encode_text']
        }
        ICL = []
        for _ in range(random.randint(1, 5)):
            # Randomly pick a different record to use as a few-shot example.
            random_data = random.choice(datas)
            while data == random_data:
                # NOTE(review): never terminates when the file holds a single record — confirm inputs always have >1 line.
                random_data = random.choice(datas)
            if data['extra_args']:
                # NOTE(review): branch tests data['extra_args'] but renders random_data's — confirm this is intentional.
                ICL.append(f'明文: {random_data["plain"]} 密钥或额外参数: {random_data["extra_args"]} 加密成为密文: {random_data["encode_text"]}\n')
            else:
                ICL.append(f'明文: {random_data["plain"]} 加密成为密文: {random_data["encode_text"]}\n')

        # Final query line for the current record itself.
        if data['extra_args']:
            ICL.append(f'明文: {data["plain"]} 密钥或额外参数: {data["extra_args"]} 加密成为密文: ? 一步一步完成')
        else:
            ICL.append(f"明文: {data['plain']} 加密成为密文: ? 一步一步完成")
        result["prompt"] += ''.join(ICL)
        result["output"] += data['encode_steps']
        results.append(result)
    write_jsonl(results, output_path)
|
||||
|
||||
|
||||
def to_ICL_decode(path, final_path):
    """Build ICL *decode* samples; mirror of to_ICL_encode for the decrypt
    direction (ciphertext in, plaintext as ground truth)."""
    datas = read_jsonl(path)
    path = os.path.basename(path)  # reuse for output file name
    output_path = os.path.join(final_path,f'icl_decode_{path}')
    results = []
    for data in datas:
        result = {
            "cipher_name": data['cipher_source'],
            "prompt": "",
            "input": data['encode_text'],
            "extra_args": data['extra_args'],
            "output": "",
            "ground_truth": data['decode_text']
        }
        ICL = []
        for _ in range(random.randint(1, 5)):
            # Randomly pick a different record as a few-shot example.
            random_data = random.choice(datas)
            while data == random_data:
                # NOTE(review): never terminates for single-record files — confirm inputs always have >1 line.
                random_data = random.choice(datas)
            if data['extra_args']:
                # NOTE(review): tests data['extra_args'] but renders random_data's — confirm intentional.
                ICL.append(f'密文: {random_data["encode_text"]} 密钥或额外参数: {random_data["extra_args"]} 解密成为明文: {random_data["plain"]}\n')
            else:
                ICL.append(f'密文: {random_data["encode_text"]} 解密成为明文: {random_data["plain"]}\n')
        # Final query line for the current record itself.
        if data['extra_args']:
            ICL.append(f'密文: {data["encode_text"]} 密钥或额外参数: {data["extra_args"]} 解密成为明文: ? 一步一步完成')
        else:
            ICL.append(f"密文: {data['encode_text']} 解密成为明文: ? 一步一步完成")
        result["prompt"] += ''.join(ICL)
        result["output"] += data['decode_steps']
        results.append(result)
    write_jsonl(results, output_path)
|
||||
|
||||
|
||||
def to_ICL_with_rule_encode(path, final_path):
    """Like to_ICL_encode, but the prompt is prefixed with the cipher's
    encode rule and only 0-3 few-shot examples are drawn."""
    datas = read_jsonl(path)
    path = os.path.basename(path)  # reuse for output file name
    output_path = os.path.join(final_path,f'icl_with_rule_encode_{path}')
    results = []
    for data in datas:
        result = {
            "cipher_name": data['cipher_source'],
            "prompt": f"请根据加密算法对明文进行加密\n{data['encode_rule']}\n",
            "input": data['decode_text'],
            "extra_args": data['extra_args'],
            "output": "",
            "ground_truth": data['encode_text']
        }
        ICL = []
        for _ in range(random.randint(0, 3)):
            # Randomly pick a different record as a few-shot example.
            random_data = random.choice(datas)
            while data == random_data:
                # NOTE(review): never terminates for single-record files — confirm inputs always have >1 line.
                random_data = random.choice(datas)
            if data['extra_args']:
                # NOTE(review): tests data['extra_args'] but renders random_data's — confirm intentional.
                ICL.append(f'明文: {random_data["plain"]} 密钥或额外参数: {random_data["extra_args"]} 加密成为密文: {random_data["encode_text"]}\n')
            else:
                ICL.append(f'明文: {random_data["plain"]} 加密成为密文: {random_data["encode_text"]}\n')
        # Final query line for the current record itself.
        if data['extra_args']:
            ICL.append(f'明文: {data["plain"]} 密钥或额外参数: {data["extra_args"]} 加密成为密文: ? 一步一步完成')
        else:
            ICL.append(f"明文: {data['plain']} 加密成为密文: ? 一步一步完成")
        result["prompt"] += ''.join(ICL)
        result["output"] += data['encode_steps']
        results.append(result)
    write_jsonl(results, output_path)
|
||||
|
||||
|
||||
def to_ICL_with_rule_decode(path, final_path):
    """Like to_ICL_decode, but the prompt is prefixed with both the encode
    and decode rules and only 0-3 few-shot examples are drawn."""
    datas = read_jsonl(path)
    path = os.path.basename(path)  # reuse for output file name
    output_path = os.path.join(final_path,f'icl_with_rule_decode_{path}')
    results = []
    for data in datas:
        result = {
            "cipher_name": data['cipher_source'],
            "prompt": f"请根据加密算法和解密算法对密码进行解密\n加密算法:\n{data['encode_rule']}\n解密算法:\n{data['decode_rule']}\n",
            "input": data['encode_text'],
            "extra_args": data['extra_args'],
            "output": "",
            "ground_truth": data['decode_text']
        }
        ICL = []
        for _ in range(random.randint(0, 3)):
            # Randomly pick a different record as a few-shot example.
            random_data = random.choice(datas)
            while data == random_data:
                # NOTE(review): never terminates for single-record files — confirm inputs always have >1 line.
                random_data = random.choice(datas)
            if data['extra_args']:
                # NOTE(review): tests data['extra_args'] but renders random_data's — confirm intentional.
                ICL.append(f'密文: {random_data["encode_text"]} 密钥或额外参数: {random_data["extra_args"]} 解密成为明文: {random_data["plain"]}\n')
            else:
                ICL.append(f'密文: {random_data["encode_text"]} 解密成为明文: {random_data["plain"]}\n')
        # Final query line for the current record itself.
        if data['extra_args']:
            ICL.append(f'密文: {data["encode_text"]} 密钥或额外参数: {data["extra_args"]} 解密成为明文: ? 一步一步完成')
        else:
            ICL.append(f"密文: {data['encode_text']} 解密成为明文: ? 一步一步完成")
        result["prompt"] += ''.join(ICL)
        result["output"] += data['decode_steps']
        results.append(result)
    write_jsonl(results, output_path)
|
||||
def process_path(path, final_path):
    """Run every ICL translation variant, in order, for one source jsonl."""
    for translator in (
        to_ICL_encode,
        to_ICL_with_rule_encode,
        to_ICL_decode,
        to_ICL_with_rule_decode,
    ):
        translator(path, final_path)
|
||||
|
||||
def translate(origin_data_path,final_data_path):
    """Run process_path over every file in *origin_data_path* with a small
    thread pool, showing a tqdm progress bar; worker exceptions are printed
    rather than raised."""
    from concurrent.futures import ThreadPoolExecutor, as_completed
    from tqdm import tqdm

    # Collect the list of input files (non-recursive).
    train_list = [os.path.join(f'{origin_data_path}', puzzle) for puzzle in os.listdir(f'{origin_data_path}')]

    # Context manager guarantees the pool is shut down.
    with ThreadPoolExecutor(max_workers=5) as executor:  # capped at 5 worker threads
        # Submit every task, keeping future -> path so failures can be reported.
        futures = {executor.submit(process_path, path, final_data_path): path for path in train_list}

        # Iterate tasks as they complete, driving the progress bar.
        for future in tqdm(as_completed(futures), total=len(train_list), desc='cipher data translating'):
            path = futures[future]
            try:
                # Result is unused; calling .result() re-raises worker exceptions.
                result = future.result()
            except Exception as exc:
                print(f'Path {path} generated an exception: {exc}')
|
||||
|
||||
|
||||
276
internbootcamp/bootcamp_utils/cipher_prompt_enhance.py
Executable file
276
internbootcamp/bootcamp_utils/cipher_prompt_enhance.py
Executable file
|
|
@ -0,0 +1,276 @@
|
|||
import json
|
||||
import os
|
||||
import random
|
||||
import shutil
|
||||
import tempfile
|
||||
import traceback
|
||||
|
||||
def walk_dir(dir_path, func):
    """Apply *func* to the full path of every file under *dir_path*,
    visiting directories bottom-up."""
    for parent, _subdirs, filenames in os.walk(dir_path, topdown=False):
        for name in filenames:
            full_path = os.path.join(parent, name)
            func(full_path)
|
||||
|
||||
from tqdm import tqdm
|
||||
from openai import OpenAI
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from internbootcamp.bootcamp.cipher.cipher_default import Cipherbootcamp
|
||||
|
||||
def walk_dir_multithreaded(dir_path, func, max_workers=32,**kwargs):
    """Apply ``func(file_path, **kwargs)`` to every file under *dir_path*
    using a thread pool; exceptions from workers are printed (with
    traceback) but do not stop the run. *func* must be thread-safe."""
    # Count files first so the progress bar has a total.
    file_count = sum([len(files) for _, _, files in os.walk(dir_path)])

    # Collect all file paths up front.
    file_paths = [os.path.join(root, name) for root, _, files in os.walk(dir_path) for name in files]

    # Process files in parallel while driving a progress bar.
    with tqdm(total=file_count, desc='cipher data enhancing', unit='file') as pbar:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = {executor.submit(func, file_path, **kwargs): file_path for file_path in file_paths}
            for future in as_completed(futures):
                try:
                    result = future.result()  # re-raises any worker exception
                except Exception as exc:
                    print("File processing generated an exception:")
                    traceback.print_exc()
                pbar.update(1)  # one tick per completed task
    # Note: func must be thread-safe and free of race conditions.
|
||||
|
||||
def read_jsonl(file_path):
    """Read a JSON Lines file into a list of deserialized records.

    Opens with explicit UTF-8: the data in this project is written with
    ``ensure_ascii=False`` (non-ASCII text), so relying on the platform
    default encoding breaks on e.g. Windows (cp1252).

    Args:
        file_path: path to the ``.jsonl`` file.

    Returns:
        list: one parsed object per line, in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f]
|
||||
|
||||
def get_refine_rule(rule: str, cn: bool, refined_rules):
    """Pick a random paraphrase of *rule* from the refined-rules table.

    Args:
        rule: the original rule text to look up.
        cn: use the Chinese pool ('replace') when True, else 'replace_en'.
        refined_rules: list of dicts with keys 'rule', 'replace', 'replace_en'.

    Returns:
        A randomly chosen replacement string, or ``None`` when no candidate
        exists for this rule.
    """
    candidates = []
    for rule_dict in refined_rules:
        if rule_dict['rule'] == rule:
            candidates.extend(rule_dict['replace'] if cn else rule_dict['replace_en'])
    # Explicit emptiness check replaces the previous bare ``except:`` around
    # random.choice, which also swallowed unrelated errors (e.g. KeyError
    # from a malformed rule dict).
    if not candidates:
        return None
    return random.choice(candidates)
|
||||
|
||||
def prompt_enhance(prompt:str, decode:bool, cn:bool, has_rule:bool,refined_rules):
    """Randomize the surface form of a cipher prompt.

    Optionally swaps the leading rule text for a paraphrase, substitutes
    synonyms for the boilerplate markers (one random choice per marker per
    call), prepends a random role-play preamble, and finally wraps the
    result with ``Cipherbootcamp.prompt_func``.

    Returns None when a rule paraphrase is required but none is available.
    """
    if has_rule:
        # The rule is the text before the first plaintext/ciphertext marker.
        if '\n明文:' in prompt:
            rule = prompt.split('\n明文:')[0]
        elif '\n密文:' in prompt:
            rule = prompt.split('\n密文:')[0]
        else:
            print('rule not found')
        # NOTE(review): if neither marker matched, `rule` is unbound here -> NameError; confirm inputs always carry a marker.
        refine_rule = get_refine_rule(rule,cn=cn,refined_rules=refined_rules)
        if not refine_rule:
            return None
        prompt = prompt.replace(rule, refine_rule)
    # Chinese marker -> replacement table; each value is sampled once per call.
    rep_dict_cn = {
        '文: ?': random.choice(['文: ?','文是什么?',]),
        '明文': random.choice(['明文','原文','原始信息','初始文本', '非加密信息']),
        '密文': random.choice(['密文','暗文','加密信息','加密文本','暗码','隐文']),
        '加密成为':random.choice(['加密成为','加密为','编码为', '加密成', ]),
        '解密成为':random.choice(['解密成为','解密为','解码为', '解密成', ]),
        '密钥或额外参数: ': random.choice(['密钥或额外参数: ','密钥: ','额外参数: ']),
        '一步一步完成': random.choice(['','一步一步完成','请一步一步完成, 制定合理的解题计划并严格执行。','请规划流程一步步实施,确保每一步都经过仔细检查,最终达到预期效果。','请细心地依照步骤行动,确保过程中的每个细节都不被忽视,以达成准确无误的目标。','请一步一步完成,确保过程详细严谨,结果正确。','精心完成每一步。',])
    }
    # English variant of the same table (keys stay Chinese: they are the
    # markers present in the source prompt being rewritten).
    rep_dict_en = {
        '文: ?': random.choice(['文: ?','文 is: ?',]),
        '明文': random.choice(['plain text','original information','clear text']),
        '密文': random.choice(['cipher text','encrypted text','encoded text']),
        '加密成为':random.choice(['encrypt into ','encrypt to ','encode into ','encode to ']),
        '解密成为':random.choice(['decrypt into ','decrypt to ','decode into ','decode to ']),
        '密钥或额外参数: ': random.choice(['secret key or extra parameter: ','secret key: ','extra parameter: ']),
        '一步一步完成': random.choice(['','step by step','Please complete it step by step, formulate a reasonable problem-solving plan, and strictly adhere to it.','Please plan the process and implement it step by step, ensuring that each step is carefully checked to ultimately achieve the desired outcome.','Please proceed carefully according to the steps, ensuring that every detail in the process is not overlooked, to achieve an accurate and error-free goal.',' Carefully complete each step.','Please complete it step by step, ensuring the process is detailed and rigorous, and the result is correct.'])

    }
    if cn:
        rep_dict = rep_dict_cn
    else:
        rep_dict = rep_dict_en
    # Apply the substitutions in dict order (longer markers first matters:
    # '密钥或额外参数: ' is replaced before its substring markers would be).
    for k, v in rep_dict.items():
        prompt = prompt.replace(k, v)

    # Candidate role-play preambles ('' = no preamble).
    insert_list_encode = ['','您是一位杰出的密文加密专家,请参考以下案例和信息进行加密操作。',
                          '作为编码器,您的任务是依据给出的案例中的加密算法,将明文加密为密文。',
                          '作为密码学领域的专家,您需要运用您的专业知识,分析案例中的加密算法,并对数据实施加密处理。',
                          '您的任务是使用相应的算法将敏感信息转换为不可读的形式,以保障其传输过程中的安全性。',
                          '运用您的专业技能,将提供的数据通过加密算法转换为安全的密文形式,是您的主要职责。',
                          '凭借您在密码学方面的深厚造诣,您的工作是分析并应用案例中的加密技术,确保信息在传输过程中不会被非法截获。',]
    insert_list_decode = ['','您是一位杰出的密文解密专家,请参考以下案例和信息进行解密操作。',
                          '作为解码大师,您的任务是依据案例中描述的解密算法,将密文还原为原始的明文。',
                          '作为密码学领域的专家,您需要运用您的专业知识,分析案例中的加密算法,并对数据实施解密处理。',
                          '您的任务是使用正确的算法将看似无意义的密文转换回可读的原始信息,确保信息的准确性和完整性。',
                          '您的主要职责是运用您的专业技能,将提供的密文通过恰当的解密算法恢复成最初的数据形式。',
                          '凭借您在密码学方面的深厚造诣,您的工作是分析并应用案例中的加密技术,确保信息在传输过程中不会被非法截获。',
                          ]
    insert_list_decode_en = ['','You are an excellent cipher decoder, please refer to the following examples and information to decode the ciphertext.',
                             'As a decoder, your task is to use the encryption algorithm described in the examples to decrypt the ciphertext.',
                             'As a specialist in cryptography, your job is to analyze the encryption algorithm in the examples and implement the decryption process on the data.',
                             'Your task is to convert the seemingly meaningless ciphertext into readable information using the appropriate algorithm, ensuring the accuracy and integrity of the information.',
                             'Your primary responsibility is to use your professional skills to decode the provided ciphertext using the correct algorithm and ensure the accuracy and integrity of the information.',
                             'By your deep knowledge in cryptography, your work is to analyze and apply the encryption techniques in the examples, ensuring the security of information during transmission.',
                             'Please decode the ciphertext according to the examples and the given information.'
                             ]
    insert_list_encode_en = ['','You are an excellent cipher encoder, please refer to the following examples and information to encode the plaintext.',
                             'As an encoder, your task is to use the encryption algorithm described in the examples to encrypt the plaintext.',
                             'As a specialist in cryptography, your job is to analyze the encryption algorithm in the examples and implement the encryption process on the data.',
                             'Your task is to convert the plaintext into an unreadable form usingthe appropriate algorithm, ensuring the security of the information during transmission.',
                             'Your primary responsibility is to use your professional skills to encode the provided plaintext using the correct algorithm and ensure the security of information during transmission.',
                             'By your deep knowledge in cryptography, your work is to analyze and apply the encryption techniques in the examples, ensuring the security of information during transmission.',
                             'Please encode the plaintext step by step, ensuring the process is detailed and rigorous, and the result is correct.'
                             ]
    if decode:
        insert_prompt = random.choice(insert_list_decode if cn else insert_list_decode_en)
    else:
        insert_prompt = random.choice(insert_list_encode if cn else insert_list_encode_en)
    return Cipherbootcamp.prompt_func(insert_prompt + '\n' + prompt)
|
||||
|
||||
def substring_from_keyword(s, keyword):
    """Return *s* starting at the first occurrence of *keyword* (inclusive).

    If the keyword is absent, or already sits at position 0, *s* is
    returned unchanged.
    """
    pos = s.find(keyword)
    if pos <= 0:  # -1 (not found) or 0 (already at the start)
        return s
    return s[pos:]
|
||||
|
||||
def make_rules_sure(path):
    """Harvest rule texts from every 'with_rule' jsonl under *path* into the
    shared cipher_rules.jsonl, then backfill missing English paraphrases
    via a local LLM endpoint."""
    def get_rule(file_path):
        # Extract the rule preamble (text before the first 明文/密文 marker)
        # from the first record of a 'with_rule' file, then persist it.
        if 'with_rule' not in file_path:
            return
        with open(file_path, 'r') as f:
            lines = f.readlines()
        line = json.loads(lines[0])
        prompt = line['prompt']
        if '\n明文:' in prompt:
            rule = prompt.split('\n明文:')[0]
        elif '\n密文:' in prompt:
            rule = prompt.split('\n密文:')[0]
        else:
            print('rule not found')
        # NOTE(review): if neither marker matched, `rule` is unbound here -> NameError.
        save_rule(rule)

    def save_rule(rule):
        # Append the rule to the shared jsonl unless already present.
        with open('internbootcamp/libs/data/cipher_rules.jsonl', 'a') as f:
            # Existence check re-reads the whole file on every call (O(n)
            # per rule) and is not thread-safe — callers run this from a
            # thread pool; NOTE(review): confirm duplicate appends are tolerable.
            if rule in [json.loads(line)['rule'] for line in open('internbootcamp/libs/data/cipher_rules.jsonl', 'r')]:
                # print('rule already exists')
                return
            line = json.dumps({'rule': rule,'replace': [rule,],'replace_en': []}, ensure_ascii=False)
            f.write(line)
            f.write('\n')

    def get_rule_en(rule):
        # Ask the local model to rewrite the rule in English.
        prompt = f"Your task is to rewrite the given text into English. Do not delete the information given in the given text, any information cannot be omitted or deleted.. <Given Text>{rule}</Given Text>. Please only return the text you wrote as synonymous text"
        return call_model(prompt,base_url='http://127.0.0.1:8000/v1',model='qwen2_5_72B_instruct')


    def call_model(prompt:str, model:str, base_url:str ,history_list:list[dict] = None,sysprompt:str = None,temperature = 0.001, top_p=0.8, max_tokens=4096, api_key = 'EMPTY'):
        # Thin wrapper around the OpenAI-compatible chat completions API;
        # returns only the first choice's message content.
        client = OpenAI(base_url=base_url,api_key=api_key)
        messages = []
        if sysprompt:
            messages.append({"role": "system", "content": sysprompt})
        messages.extend(history_list + [{"role": "user", "content": prompt}] if history_list else [{"role": "user", "content": prompt}])
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            n=1,
            # timeout=120,
        )
        return response.choices[0].message.content

    def translate_rule(rule_path):
        # Read all rules into a list
        with open(rule_path, 'r') as f:
            rules = [json.loads(line) for line in f.readlines()]

        # Fill in replace_en for every rule that lacks one.
        for rule in tqdm(rules, desc='Processing rules', unit='rule'):
            if not rule.get('replace_en'):  # key missing or empty list
                rule['replace_en'] = []  # initialize before appending
                rule_en = get_rule_en(rule['rule'])
                rule['replace_en'].append(rule_en)

        # Write the updated rules to a temp file, then move it over the
        # original so a crash mid-write cannot truncate the rules file.
        temp_file_path = tempfile.mkstemp()[1]
        try:
            with open(temp_file_path, 'w') as f:
                for rule in rules:
                    line = json.dumps(rule, ensure_ascii=False)
                    f.write(line + '\n')
            # Replace the original file with the updated one
            shutil.move(temp_file_path, rule_path)
        except Exception as e:
            print(f"An error occurred: {e}")
            # Clean up the temporary file in case of an error
            if os.path.exists(temp_file_path):
                os.remove(temp_file_path)

    walk_dir_multithreaded(path,get_rule)
    translate_rule('internbootcamp/libs/data/cipher_rules.jsonl')
|
||||
|
||||
def main(path,outputDir,refined_rules):
    """Enhance every prompt in one jsonl file and write the result to
    *outputDir* under the same basename.

    Mode flags (decode / cn / with_rule) are inferred from substrings of
    *path*; a file whose first enhancement fails is skipped entirely.
    """
    jsonl = read_jsonl(path)
    decode = True if 'decode' in path else False
    cn = True if 'cn' in path else False
    has_rule = True if 'with_rule' in path else False
    if '_Kor_' in path and not has_rule:
        # Kor ciphers are only processed in their with-rule variant.
        return
    res = []
    for line in jsonl:
        enhanced_prompt = prompt_enhance(line['prompt'],decode=decode,cn=cn,has_rule=has_rule,refined_rules=refined_rules)
        if enhanced_prompt:
            line['prompt'] = enhanced_prompt.strip()
            res.append(line)
        else:
            # No paraphrase available: abandon the whole file (break, not
            # continue), so partially-enhanced files are never emitted.
            print(f"data {path} enhanced prompt is empty, skip")
            break

    # line['refine_output'] = substring_from_keyword(line['refine_output'].strip(), '<restate>')
    if not res:
        return
    path_base = os.path.basename(path)
    with open(os.path.join(outputDir,path_base), 'w') as f:
        for line in res:
            f.write(json.dumps(line, ensure_ascii=False) + '\n')
|
||||
|
||||
def to_cn_en(src_dir, dest_dir):
    """Duplicate every ``.jsonl`` in *src_dir* into *dest_dir* twice, as
    ``<name>_cn.jsonl`` and ``<name>_en.jsonl`` copies."""
    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)
    for filename in os.listdir(src_dir):
        if not filename.endswith('.jsonl'):
            continue
        src_path = os.path.join(src_dir, filename)
        stem, ext = os.path.splitext(filename)
        for suffix in ('_cn', '_en'):
            # copy2 preserves the original file's metadata (mtime etc.).
            shutil.copy2(src_path, os.path.join(dest_dir, f"{stem}{suffix}{ext}"))
|
||||
|
||||
def enhance(refined_rules, inputDir, outputDir):
    """Enhance every cipher prompt file in *inputDir* into *outputDir*.

    Pipeline:
      1. harvest unseen rules into the shared rules jsonl (make_rules_sure);
      2. fork each input file into _cn/_en copies in a temp directory;
      3. rewrite the prompts in parallel via ``main``;
      4. best-effort removal of the temp directory.

    Args:
        refined_rules: path to the refined-rules jsonl file.
        inputDir: directory of input prompt jsonl files.
        outputDir: destination directory (created if missing).
    """
    tmp_dir = inputDir + '_tmp'
    make_rules_sure(inputDir)
    to_cn_en(inputDir, tmp_dir)

    # exist_ok avoids the racy check-then-create of the original.
    os.makedirs(outputDir, exist_ok=True)
    walk_dir_multithreaded(tmp_dir, main, outputDir=outputDir, refined_rules=read_jsonl(refined_rules))
    # Best-effort cleanup: ignore_errors replaces the old bare ``except: pass``,
    # which also hid unrelated bugs; rmtree errors are still suppressed.
    shutil.rmtree(tmp_dir, ignore_errors=True)
|
||||
73
internbootcamp/bootcamp_utils/deduplicator.py
Executable file
73
internbootcamp/bootcamp_utils/deduplicator.py
Executable file
|
|
@ -0,0 +1,73 @@
|
|||
import json
|
||||
import os
|
||||
def deduplicate_jsonl_by_field(input_file_path, output_file_path=None,field_name='id'):
    """Remove records whose *field_name* value was already seen (first wins).

    Records with the field missing or None are dropped. When
    *output_file_path* is falsy the input file is rewritten in place.
    Undecodable lines are reported and skipped.
    """
    seen = set()
    unique_entries = []

    with open(input_file_path, 'r', encoding='utf-8') as infile:
        for raw in infile:
            try:
                record = json.loads(raw)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON: {e}")
                continue
            value = record.get(field_name)
            if value is not None and value not in seen:
                seen.add(value)
                unique_entries.append(record)

    if not output_file_path:
        output_file_path = input_file_path  # in-place rewrite
    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        outfile.writelines(json.dumps(r, ensure_ascii=False) + '\n' for r in unique_entries)

    print(f"Deduplication complete. {len(unique_entries)} unique entries written to {output_file_path}")
|
||||
|
||||
|
||||
def get_difference_optimized(file1, file2, key_field, output_file):
    """Write to *output_file* the records of *file1* whose *key_field*
    value does not appear in *file2* (set difference on that field).

    Args:
        file1: baseline JSON Lines file.
        file2: JSON Lines file whose keys are subtracted.
        key_field: field name used for the comparison.
        output_file: destination path for the difference.

    Returns:
        The *output_file* path. If *file2* does not exist, the path is
        returned untouched and nothing is written.
    """
    if not os.path.exists(file2):
        return output_file

    # Load the baseline file, remembering both records and their keys.
    baseline_records = []
    with open(file1, 'r', encoding='utf-8') as fh:
        for raw in fh:
            baseline_records.append(json.loads(raw))
    baseline_keys = {rec[key_field] for rec in baseline_records}

    # Only the keys of the second file matter.
    with open(file2, 'r', encoding='utf-8') as fh:
        subtract_keys = {json.loads(raw)[key_field] for raw in fh}

    survivors = baseline_keys - subtract_keys
    difference = [rec for rec in baseline_records if rec[key_field] in survivors]

    # Emit the difference as JSON Lines.
    with open(output_file, 'w', encoding='utf-8') as fh:
        for rec in difference:
            fh.write(json.dumps(rec, ensure_ascii=False) + '\n')

    return output_file
|
||||
5
internbootcamp/bootcamp_utils/formatted_time.py
Executable file
5
internbootcamp/bootcamp_utils/formatted_time.py
Executable file
|
|
@ -0,0 +1,5 @@
|
|||
import time
|
||||
|
||||
def formatted_time():
    """Return the current local time as a compact 'YYYYMMDD_HHMMSS' stamp."""
    return time.strftime("%Y%m%d_%H%M%S", time.localtime())
|
||||
83
internbootcamp/bootcamp_utils/jsonlines.py
Executable file
83
internbootcamp/bootcamp_utils/jsonlines.py
Executable file
|
|
@ -0,0 +1,83 @@
|
|||
import json
|
||||
|
||||
|
||||
def append_json_lines(source_file, target_file):
    """Append every line of *source_file* onto the end of *target_file*.

    Lines are copied verbatim; no JSON parsing or validation happens.

    Args:
        source_file: JSON Lines file to read from.
        target_file: JSON Lines file that receives the appended records.
    """
    with open(source_file, 'r', encoding='utf-8') as src, \
            open(target_file, 'a', encoding='utf-8') as dst:
        dst.writelines(src)
|
||||
|
||||
def read_jsonl(path, encoding='utf-8'):
    """Load a JSON Lines file as a list of deserialized records."""
    records = []
    with open(path, 'r', encoding=encoding) as f:
        for line in f:
            records.append(json.loads(line))
    return records
|
||||
|
||||
|
||||
def add_jsonl(data, path, encoding='utf-8'):
    """Append a single record to a JSON Lines file."""
    serialized = json.dumps(data, ensure_ascii=False)
    with open(path, 'a', encoding=encoding) as f:
        f.write(serialized + '\n')
|
||||
|
||||
def write_jsonl(data, path, encoding='utf-8'):
    """Overwrite *path* with one JSON document per element of *data*."""
    with open(path, 'w', encoding=encoding) as f:
        f.writelines(json.dumps(item, ensure_ascii=False) + '\n' for item in data)
|
||||
|
||||
def extend_jsonl(datas, path, encoding='utf-8'):
    """Append every record in *datas* to the JSON Lines file at *path*."""
    with open(path, 'a', encoding=encoding) as f:
        f.writelines(json.dumps(item, ensure_ascii=False) + '\n' for item in datas)
|
||||
|
||||
import os
|
||||
from itertools import count
|
||||
|
||||
|
||||
def merge_jsonlines(input_directory, output_file):
    """Merge every ``.jsonl`` under *input_directory* (recursively) into one file.

    Each output record gains two leading fields:
      * ``id`` — a global 1-based counter over all merged records;
      * ``source_filename`` — the record's origin, relative to *input_directory*.

    Undecodable lines are skipped with a warning that names the offending file.
    """
    id_counter = count(start=1)

    with open(output_file, 'w', encoding='utf-8') as outfile:
        # os.walk visits the directory tree recursively.
        for root, dirs, files in os.walk(input_directory):
            for filename in files:
                if not filename.endswith('.jsonl'):
                    continue
                file_path = os.path.join(root, filename)
                with open(file_path, 'r', encoding='utf-8') as infile:
                    for line in infile:
                        try:
                            record = json.loads(line)
                        except json.JSONDecodeError:
                            # Bug fix: the old message was a placeholder-less
                            # f-string ("... in (unknown)") that never said
                            # which file held the broken line.
                            print(f"Warning: Could not decode line in {file_path}")
                            continue
                        # id/source_filename first, then the original fields.
                        new_record = {
                            'id': next(id_counter),
                            'source_filename': os.path.relpath(file_path, input_directory),
                        }
                        new_record.update(record)
                        outfile.write(json.dumps(new_record, ensure_ascii=False) + '\n')
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Ad-hoc driver: flatten one evaluation directory into a single jsonl.
    source_dir = 'data_generator_outputs/_Eval/20250114_153504/easy'
    merged_path = 'data_generator_outputs/_Eval/20250114_153504/easy_test.jsonl'
    merge_jsonlines(source_dir, merged_path)
|
||||
13
internbootcamp/bootcamp_utils/random_things.py
Executable file
13
internbootcamp/bootcamp_utils/random_things.py
Executable file
|
|
@ -0,0 +1,13 @@
|
|||
import random
|
||||
# Load the word list once at import time; each file line holds one word.
with open('./internbootcamp/libs/data/words_alpha_370000.txt', 'r') as f:
    words = [line.strip() for line in f]
||||
def random_word():
    """Return a random word from the loaded list with at least 3 characters."""
    while True:
        candidate = random.choice(words)
        if len(candidate) >= 3:
            return candidate
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Smoke test: emit one random word.
    sample = random_word()
    print(sample)
|
||||
94
internbootcamp/bootcamp_utils/sampler.py
Executable file
94
internbootcamp/bootcamp_utils/sampler.py
Executable file
|
|
@ -0,0 +1,94 @@
|
|||
import json
|
||||
import random
|
||||
|
||||
def sample_sentences_by_percentage_distribution(sentences_list, percentage_distribution=None, total_samples=1000):
    """Sample sentences following a per-length-range percentage distribution.

    Args:
        sentences_list: list of sentence strings to sample from.
        percentage_distribution: mapping of inclusive ``(min_len, max_len)``
            length ranges to the fraction of *total_samples* to draw from
            that range. Defaults to a distribution skewed toward short
            sentences. (Bug fix: the default used to be a shared mutable
            ``{}``; ``None`` is equivalent here since both are falsy.)
        total_samples: total number of sentences to draw.

    Returns:
        list: up to *total_samples* sampled sentences, shuffled.
    """
    if not percentage_distribution:
        percentage_distribution = {
            (0, 10): 0.7,
            (11, 20): 0.15,
            (21, 30): 0.10,
            (31, 60): 0.05,
        }

    # Number of sentences to draw from each length range.
    length_to_sample = {
        length_range: max(0, int(total_samples * percentage))
        for length_range, percentage in percentage_distribution.items()
    }

    # Group the sentences by the first length range they fall into.
    grouped_sentences = {}
    for sentence in sentences_list:
        sentence_length = len(sentence)
        for (min_len, max_len) in length_to_sample.keys():
            if min_len <= sentence_length <= max_len:
                grouped_sentences.setdefault((min_len, max_len), []).append(sentence)
                break

    # Draw from each group, clamping when a group is too small.
    sampled_sentences = []
    for (min_len, max_len), n in length_to_sample.items():
        filtered_sentences = grouped_sentences.get((min_len, max_len), [])

        if len(filtered_sentences) < n:
            # Bug fix: the upper bound used to be a garbled literal instead
            # of the actual ``max_len``.
            print(f"警告:长度在{min_len}到{max_len}之间的句子不足{n}个,只有{len(filtered_sentences)}个。")
            n = len(filtered_sentences)

        if n > 0:
            sampled_sentences.extend(random.sample(filtered_sentences, n))

    # Warn when the request could not be fully satisfied.
    if len(sampled_sentences) < total_samples:
        print(f"Warning: Only {len(sampled_sentences)} sentences were sampled.")

    # Shuffle so ranges are interleaved rather than blocked.
    random.shuffle(sampled_sentences)

    return sampled_sentences[:total_samples]
|
||||
|
||||
def downsample_dict_list_by_unique_field(data_list, unique_field):
    """Keep only the first dict for each distinct value of *unique_field*.

    Args:
        data_list: list of dicts, each of which must contain *unique_field*.
        unique_field: name of the field whose values must be unique.

    Returns:
        list: input order preserved, later duplicates removed.

    Raises:
        ValueError: when either argument has the wrong type.
        KeyError: when an element lacks *unique_field*.
    """
    if not isinstance(data_list, list):
        raise ValueError("输入必须是一个列表")
    if not isinstance(unique_field, str):
        raise ValueError("unique_field 必须是字符串类型")

    seen = set()
    deduplicated = []

    for entry in data_list:
        if unique_field not in entry:
            raise KeyError(f"字典中缺少 '{unique_field}' 字段")

        value = entry[unique_field]
        if value in seen:
            continue  # later duplicates are dropped
        seen.add(value)
        deduplicated.append(entry)

    return deduplicated
|
||||
Loading…
Add table
Add a link
Reference in a new issue