BLEUBERI/eval/FastChat/fastchat/serve/monitor/add_markdown_info.py
2025-06-04 20:36:43 +00:00

84 lines
2.7 KiB
Python

import pandas as pd
import re
import argparse
from tqdm import tqdm
tqdm.pandas()
def count_markdown_elements(markdown_text, suffix):
counters = {
f"header_count{suffix}": {
"h1": len(re.findall(r"^#{1}\s", markdown_text, re.MULTILINE)),
"h2": len(re.findall(r"^#{2}\s", markdown_text, re.MULTILINE)),
"h3": len(re.findall(r"^#{3}\s", markdown_text, re.MULTILINE)),
"h4": len(re.findall(r"^#{4}\s", markdown_text, re.MULTILINE)),
"h5": len(re.findall(r"^#{5}\s", markdown_text, re.MULTILINE)),
"h6": len(re.findall(r"^#{6}\s", markdown_text, re.MULTILINE)),
},
f"list_count{suffix}": {
"ordered": len(re.findall(r"^\s*\d+\.\s", markdown_text, re.MULTILINE)),
"unordered": len(re.findall(r"^\s*[-*+]\s", markdown_text, re.MULTILINE)),
},
f"bold_count{suffix}": {
"**": len(re.findall(r"\*\*[^*\n]+\*\*", markdown_text)),
"__": len(re.findall(r"__[^_\n]+__", markdown_text)),
},
}
return counters
def remove_pattern(answer, pattern):
blocks = pattern.findall(answer)
for block in blocks:
answer = answer.replace(block, "")
return answer
def get_element_counts(df, column):
pattern = re.compile("```([^`]*)```")
answers = df[column].map(
lambda convo: "\n".join(
[turn["content"] for turn in convo if turn["role"] == "assistant"]
)
)
results = answers.progress_map(
lambda answer: count_markdown_elements(
remove_pattern(answer, pattern),
suffix=column[-2:], # Remove code block first
)
)
return results.tolist()
def add_markdown_meta(row):
conv_meta = {k: v for k, v in row["conv_metadata"].items()}
return conv_meta | row["markdown_meta_a"] | row["markdown_meta_b"]
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--input-file", type=str, required=True)
parser.add_argument("--output-file", type=str, required=True)
args = parser.parse_args()
print("loading file...")
data = pd.read_json(args.input_file)
assert "conv_metadata" in data.columns
temp = data[["question_id", "conv_metadata"]].copy()
print("Processing conversation_a")
temp["markdown_meta_a"] = get_element_counts(data, column="conversation_a")
print("Processing conversation_b")
temp["markdown_meta_b"] = get_element_counts(data, column="conversation_b")
print("Post-processing...")
data["conv_metadata"] = temp.apply(add_markdown_meta, axis=1)
print("Saving to file...")
data.to_json(args.output_file, orient="records", indent=4, force_ascii=False)