mirror of
https://github.com/lilakk/BLEUBERI.git
synced 2026-04-19 12:58:12 +00:00
84 lines
2.7 KiB
Python
84 lines
2.7 KiB
Python
import pandas as pd
|
|
import re
|
|
import argparse
|
|
|
|
from tqdm import tqdm
|
|
|
|
# Hook tqdm into pandas; get_element_counts relies on Series.progress_map.
tqdm.pandas()
|
|
|
|
|
|
def count_markdown_elements(markdown_text, suffix):
    """Tally Markdown headers, list items and bold spans in a text.

    Args:
        markdown_text: Markdown source to scan.
        suffix: String appended to each top-level key of the result
            (for example ``"_a"``).

    Returns:
        A dict with the keys ``header_count{suffix}``,
        ``list_count{suffix}`` and ``bold_count{suffix}``. Each value is a
        dict mapping a marker name (``"h1"``..``"h6"``, ``"ordered"``,
        ``"unordered"``, ``"**"``, ``"__"``) to its number of regex matches.
    """

    def tally(regex, flags=0):
        return len(re.findall(regex, markdown_text, flags))

    # A level-N header is exactly N '#' characters at the start of a line,
    # followed by a whitespace character.
    headers = {
        f"h{level}": tally(rf"^#{{{level}}}\s", re.MULTILINE)
        for level in range(1, 7)
    }

    # List items are recognised by their marker at the start of a line,
    # optionally preceded by whitespace.
    lists = {
        "ordered": tally(r"^\s*\d+\.\s", re.MULTILINE),
        "unordered": tally(r"^\s*[-*+]\s", re.MULTILINE),
    }

    # Bold spans must open and close on the same line.
    bold = {
        "**": tally(r"\*\*[^*\n]+\*\*"),
        "__": tally(r"__[^_\n]+__"),
    }

    return {
        f"header_count{suffix}": headers,
        f"list_count{suffix}": lists,
        f"bold_count{suffix}": bold,
    }
|
|
|
|
|
|
def remove_pattern(answer, pattern):
    """Return ``answer`` with every match of ``pattern`` deleted.

    Args:
        answer: Text to clean.
        pattern: Compiled regular expression (``re.Pattern``) describing
            the spans to drop, e.g. fenced code blocks.

    Returns:
        A new string in which each non-overlapping match of ``pattern``
        has been replaced by the empty string.
    """
    # Substitute on the whole match rather than on ``findall`` results:
    # when the pattern contains a capturing group, ``findall`` yields only
    # the group text, so replacing that text would leave the delimiters
    # behind and would also delete identical text outside the matches.
    return pattern.sub("", answer)
|
|
|
|
|
|
def get_element_counts(df, column):
    """Compute Markdown element counts for one conversation column.

    Args:
        df: DataFrame whose ``column`` holds conversations, each an
            iterable of turns indexable by ``"role"`` and ``"content"``.
        column: Name of the conversation column. Its last two characters
            are used as the key suffix passed to
            ``count_markdown_elements``.

    Returns:
        A list with one ``count_markdown_elements`` result per row, in
        row order.
    """
    fence_re = re.compile("```([^`]*)```")
    key_suffix = column[-2:]

    def assistant_text(convo):
        # Only the assistant's turns are measured, joined by newlines.
        return "\n".join(
            turn["content"] for turn in convo if turn["role"] == "assistant"
        )

    def tally(answer):
        # Remove code block first
        cleaned = remove_pattern(answer, fence_re)
        return count_markdown_elements(cleaned, suffix=key_suffix)

    answers = df[column].map(assistant_text)
    return answers.progress_map(tally).tolist()
|
|
|
|
|
|
def add_markdown_meta(row):
    """Merge a row's Markdown counts into a copy of its metadata.

    Args:
        row: Mapping-like row providing ``"conv_metadata"``,
            ``"markdown_meta_a"`` and ``"markdown_meta_b"``.

    Returns:
        A new dict containing the ``conv_metadata`` entries followed by
        those of ``markdown_meta_a`` and ``markdown_meta_b``; on key
        clashes the later source wins. The input dicts are not modified.
    """
    base = dict(row["conv_metadata"].items())
    side_a = row["markdown_meta_a"]
    side_b = row["markdown_meta_b"]
    return base | side_a | side_b
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: read a JSON file of conversation records,
    # extend each record's "conv_metadata" with Markdown element counts for
    # both conversations, and write the records back out as JSON.
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-file", type=str, required=True)
    parser.add_argument("--output-file", type=str, required=True)
    args = parser.parse_args()

    print("loading file...")
    data = pd.read_json(args.input_file)

    # Validate explicitly instead of with ``assert`` (which is skipped under
    # ``python -O``), and cover every column the script reads below.
    required_columns = (
        "question_id",
        "conv_metadata",
        "conversation_a",
        "conversation_b",
    )
    missing = [name for name in required_columns if name not in data.columns]
    if missing:
        raise ValueError(
            f"{args.input_file} is missing required column(s): {', '.join(missing)}"
        )

    # Work on a copy so the intermediate count columns never reach the output.
    temp = data[["question_id", "conv_metadata"]].copy()

    print("Processing conversation_a")
    temp["markdown_meta_a"] = get_element_counts(data, column="conversation_a")

    print("Processing conversation_b")
    temp["markdown_meta_b"] = get_element_counts(data, column="conversation_b")

    print("Post-processing...")
    data["conv_metadata"] = temp.apply(add_markdown_meta, axis=1)

    print("Saving to file...")
    data.to_json(args.output_file, orient="records", indent=4, force_ascii=False)
|