From 737139994a2f28b804bd87bf8816536ea18d9ef8 Mon Sep 17 00:00:00 2001 From: Josh Date: Sun, 18 May 2025 12:52:04 -0700 Subject: [PATCH 1/8] feat: Initial setup for AccessibilityEnv directory and placeholder files --- .../hack0/accessibility_env/README.md | 0 .../accessibility_env/accessibility_env.py | 260 ++++++++++++++++++ .../hack0/accessibility_env/requirements.txt | 0 3 files changed, 260 insertions(+) create mode 100644 environments/hack0/accessibility_env/README.md create mode 100644 environments/hack0/accessibility_env/accessibility_env.py create mode 100644 environments/hack0/accessibility_env/requirements.txt diff --git a/environments/hack0/accessibility_env/README.md b/environments/hack0/accessibility_env/README.md new file mode 100644 index 00000000..e69de29b diff --git a/environments/hack0/accessibility_env/accessibility_env.py b/environments/hack0/accessibility_env/accessibility_env.py new file mode 100644 index 00000000..f9e62d32 --- /dev/null +++ b/environments/hack0/accessibility_env/accessibility_env.py @@ -0,0 +1,260 @@ +# environments/hack0/accessibility_env/accessibility_env.py +from typing import List, Optional, Tuple # Common type hints + +from atroposlib.envs.base import APIServerConfig, BaseEnv, BaseEnvConfig +from atroposlib.type_definitions import ( # Assuming you'll need these + Item, + ScoredDataGroup, +) +from atroposlib.utils.tokenize_for_trainer import tokenize_for_trainer + + +class AccessibilityEnvConfig(BaseEnvConfig): + # Add any custom config fields specific to your env later + pass + + +class AccessibilityEnv(BaseEnv): + name = "accessibility_env" # A unique name for your environment + + def __init__( + self, + config: AccessibilityEnvConfig, + server_configs: List[APIServerConfig], + slurm=True, + testing=False, + ): + super().__init__(config, server_configs, slurm, testing) + # Initialize any env-specific attributes here + + @classmethod + def config_init(cls) -> Tuple[AccessibilityEnvConfig, List[APIServerConfig]]: + env_config = AccessibilityEnvConfig( + tokenizer_name="NousResearch/Llama-3-8B-Instruct- যেভাবে-তুমি-বাংলা-বলো", # Placeholder, change later + group_size=4, # Example, adjust as needed + use_wandb=True, # Recommended for hackathon + rollout_server_url="http://localhost:8000", # Standard Atropos default + total_steps=100, # For process mode, this is more like num_items_to_process + batch_size=8, # Example + steps_per_eval=20, # Less relevant for process-only + max_token_length=2048, # LLM context window + wandb_name="accessibility_env_hackathon", # Your Wandb run name + # data_path_to_save_groups="accessibility_rollouts.jsonl" # Often set via CLI for process + ) + server_configs = [ + APIServerConfig( + model_name="gpt-3.5-turbo", # Placeholder, use your desired model + # base_url="YOUR_LLM_PROVIDER_BASE_URL_IF_NOT_OPENAI_DEFAULT", # e.g., for vLLM + # api_key="YOUR_API_KEY_HERE_OR_USE_ENV_VAR", # Best to use os.environ.get("OPENAI_API_KEY") + num_requests_for_eval=32, # Example + ), + ] + return env_config, server_configs + + async def setup(self): + print(f"[{self.name}] Setting up environment...") + # Load dataset, initialize tools (e.g., HTML parser) here + self.dataset = [] # Placeholder for your HTML snippets + self.iter = 0 + print(f"[{self.name}] Setup complete.") + + async def get_next_item(self) -> Optional[Item]: + if self.iter >= len(self.dataset): + if ( + self.iter >= self.config.total_steps + ): # Stop after total_steps for 'process' + return None + # Potentially loop dataset or handle running out of unique items + # For 
hackathon, just stopping might be fine if dataset is small + # and total_steps is matched to dataset size. + # self.iter = 0 # To loop + print(f"[{self.name}] Reached end of dataset or total_steps.") + return None + + item_data = self.dataset[self.iter] + self.iter += 1 + # Format item_data into the 'Item' structure Atropos expects + # Typically (prompt_messages_tuple, gold_answer_or_metadata_tuple) + # Example: + # user_prompt = {"role": "user", "content": f"Make this HTML accessible: {item_data['html_snippet']}"} + # system_prompt_content = "You are an AI assistant specializing in web accessibility. Modify the given + # HTML to meet WCAG AA standards. Output only the modified HTML." + # system_prompt = {"role": "system", "content": system_prompt_content} + # prompt_messages = (system_prompt, user_prompt) # This needs to be a tuple of dicts + # messages_for_item = tuple(frozenset(p.items()) for p in prompt_messages) # Atropos often expects this format + # return (messages_for_item, item_data.get('expected_outcome_or_id')) # Second part is for scoring reference + + # Simpler start for prompt: + # prompt = ( + # ( + # { + # "role": "system", + # "content": "You are an AI assistant. Given HTML, make it more accessible.", + # }, + # ), + # ({"role": "user", "content": f"Original HTML: {item_data['html']}"},), + # ) + # This prompt structure might need adjustment based on how Atropos and the LLM API expect it. + # The gsm8k example has: + # user_message = {"role": "user", "content": item["question"]} + # chat_completions = await self.server.chat_completion( + # messages=[{"role": "system", "content": system_prompt}, user_message], ... + # So a list of dicts is passed to chat_completion. + # The 'Item' type for get_next_item is often a tuple: ( (message_part_1, message_part_2, ...), + # metadata_for_scoring ) + # where each message_part is often a frozenset of items from a dict. This is a bit complex. + # Let's start with a simple string prompt and adapt. + # For now, let's assume item is (prompt_string, metadata_for_scoring) + # The `collect_trajectories` in coding_server.py takes `item: Item` + # and then accesses `item[0][0]` which implies item is nested. + # `prompt = tuple([frozenset({"role": "user", "content": next_item["description"]}.items())])` + # `return (prompt, answer)` + # So, first element of item is a tuple of frozensets. + + # Let's simplify for now and refine based on Atropos internals if needed. + # We'll construct the messages list directly in collect_trajectories. + # So get_next_item can return the raw data needed. + return item_data # This will be like {"html": "...", "id": "..."} + + async def collect_trajectories( + self, item_data: Item + ) -> Tuple[Optional[ScoredDataGroup], List[Item]]: + # 'item_data' here is what get_next_item returned. + original_html = item_data["html"] + system_message_content = ( + "You are an expert web developer specializing in accessibility. " + "Given the following HTML snippet, please make the minimal necessary modifications " + "to ensure it meets WCAG 2.1 AA standards for the issues present. " + "Output only the complete, modified HTML snippet. Do not include explanations unless explicitly asked." 
+ ) + user_message_content = ( + f"Original HTML:\n```html\n{original_html}\n```\nModified HTML:" + ) + + messages = [ + {"role": "system", "content": system_message_content}, + {"role": "user", "content": user_message_content}, + ] + + chat_completions = await self.server.chat_completion( + messages=messages, + n=self.config.group_size, + max_tokens=self.config.max_token_length, + # temperature=0.7, # Optional: adjust for creativity vs. determinism + ) + + to_score_inputs = [] + for choice in chat_completions.choices: + llm_response_content = choice.message.content + # The 'messages' to store for scoring/tokenization should represent the full exchange + # that led to this specific llm_response_content. + # This includes the original system and user messages, and the assistant's response. + full_exchange_messages = messages + [ + {"role": "assistant", "content": llm_response_content} + ] + to_score_inputs.append( + { + "full_exchange_messages": full_exchange_messages, # For tokenization + "llm_modified_html": llm_response_content, # For direct scoring + "original_html_info": item_data, # To know what to check against + } + ) + + # The `score` method in Atropos expects a list where each element typically is + # (messages_tuple_for_tokenization, original_item_metadata_for_scoring_logic) + # We need to adapt `to_score_inputs` to what `self.score` will expect. + # Let's define that `self.score` will take this list of dicts directly. + # The `collect_trajectories` from the blog post returns `to_postprocess, to_backlog` + # where `to_postprocess` is the output of `self.score`. + + scored_data_group = await self.score(to_score_inputs) + return scored_data_group, [] # Assuming no backlog for now + + async def score(self, rollout_group_data: List[dict]) -> Optional[ScoredDataGroup]: + # rollout_group_data is a list of dicts, each like: + # { + # "full_exchange_messages": [...], + # "llm_modified_html": "...", + # "original_html_info": {"html": "...", "id": "...", "issues": [...]} + # } + print(f"[{self.name}] Scoring {len(rollout_group_data)} rollouts...") + scores_obj = ScoredDataGroup() # Use the Atropos defined type + # Initialize lists within scores_obj as per ScoredDataGroup structure + # (typically 'tokens', 'masks', 'scores', maybe 'logprobs') + scores_obj["tokens"] = [] + scores_obj["masks"] = [] + scores_obj["scores"] = [] + # scores_obj["infos"] = [] # Optional for extra debug info + + for data_item in rollout_group_data: + llm_html = data_item["llm_modified_html"] + original_info = data_item["original_html_info"] + + # Basic reward: 1.0 if fixed, -1.0 if not. + # This will be replaced with actual WCAG checks. + current_score = -1.0 # Default to failure + # ---- YOUR SCORING LOGIC HERE ---- + # Example: (pseudo-code, requires BeautifulSoup and specific checks) + # violations_fixed = self.check_wcag_fixes(llm_html, original_info) + # if violations_fixed: + # current_score = 1.0 + # For now, a placeholder: + if "" in original_info["html"] and "for=" in llm_html: + current_score = 1.0 + + # Tokenize the full exchange for the trainer + # The 'tokenize_for_trainer' util expects a tuple/list of message dicts + tokenized_output = tokenize_for_trainer( + self.tokenizer, + data_item["full_exchange_messages"], # Pass the list of message dicts + ) + + # Ensure tokenized_output contains 'tokens' and 'masks' + if "tokens" not in tokenized_output or "masks" not in tokenized_output: + print( + f"[{self.name}] Warning: Tokenization did not return tokens/masks for an item. Skipping." 
+ ) + continue + + scores_obj["tokens"].append(tokenized_output["tokens"]) + scores_obj["masks"].append(tokenized_output["masks"]) + scores_obj["scores"].append(current_score) + # scores_obj["infos"].append({"original_id": original_info["id"], "llm_output_preview": llm_html[:100]}) + + # Handle case where no valid items were scored + if not scores_obj["scores"]: + print(f"[{self.name}] No valid items to score, returning None.") + return None + + # Atropos convention: if all scores are identical, return None (no learning signal) + # This might be too strict for early testing. Consider enabling later. + # if len(set(scores_obj["scores"])) == 1 and len(scores_obj["scores"]) > 1 : + # print(f"[{self.name}] All scores are identical ({scores_obj['scores'][0]}), returning None.") + # return None + + print(f"[{self.name}] Scoring complete. Scores: {scores_obj['scores']}") + return scores_obj + + async def evaluate( + self, + ): # Optional, might not be needed for hackathon 'process' focus + print(f"[{self.name}] Evaluate method called (placeholder).") + # Implement evaluation logic if you have a separate test set and metrics + pass + + # --- Helper methods for scoring --- + # def check_wcag_fixes(self, modified_html: str, original_item_info: dict) -> bool: + # # Placeholder for your actual WCAG checking logic + # # e.g., using BeautifulSoup to parse modified_html + # # and checking against `original_item_info['issues_to_fix']` + # # from bs4 import BeautifulSoup + # # soup = BeautifulSoup(modified_html, 'html.parser') + # # ... logic ... + # return False + + +if __name__ == "__main__": + # This makes your environment runnable with `python accessibility_env.py process` + AccessibilityEnv.cli() diff --git a/environments/hack0/accessibility_env/requirements.txt b/environments/hack0/accessibility_env/requirements.txt new file mode 100644 index 00000000..e69de29b From 659247fc00fe83554f203a2df813aa43d87f98bb Mon Sep 17 00:00:00 2001 From: Josh Date: Sun, 18 May 2025 13:07:08 -0700 Subject: [PATCH 2/8] Fix environment issues. `Safely ran python3 accessibility_env.py --help` --- .../accessibility_env/accessibility_env.py | 43 +++++++++++-------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/environments/hack0/accessibility_env/accessibility_env.py b/environments/hack0/accessibility_env/accessibility_env.py index f9e62d32..354c1131 100644 --- a/environments/hack0/accessibility_env/accessibility_env.py +++ b/environments/hack0/accessibility_env/accessibility_env.py @@ -1,11 +1,17 @@ # environments/hack0/accessibility_env/accessibility_env.py -from typing import List, Optional, Tuple # Common type hints +import os # For API keys, etc. 
+from typing import List, Optional, Tuple # Common type hints, added Dict -from atroposlib.envs.base import APIServerConfig, BaseEnv, BaseEnvConfig -from atroposlib.type_definitions import ( # Assuming you'll need these - Item, +# Corrected imports for Atropos types +from atroposlib.envs.base import ( + APIServerConfig, + BaseEnv, + BaseEnvConfig, ScoredDataGroup, ) +from atroposlib.type_definitions import ( # GameHistory might not be needed yet, Item is common + Item, +) from atroposlib.utils.tokenize_for_trainer import tokenize_for_trainer @@ -30,23 +36,24 @@ class AccessibilityEnv(BaseEnv): @classmethod def config_init(cls) -> Tuple[AccessibilityEnvConfig, List[APIServerConfig]]: env_config = AccessibilityEnvConfig( - tokenizer_name="NousResearch/Llama-3-8B-Instruct- যেভাবে-তুমি-বাংলা-বলো", # Placeholder, change later - group_size=4, # Example, adjust as needed - use_wandb=True, # Recommended for hackathon - rollout_server_url="http://localhost:8000", # Standard Atropos default - total_steps=100, # For process mode, this is more like num_items_to_process - batch_size=8, # Example - steps_per_eval=20, # Less relevant for process-only - max_token_length=2048, # LLM context window - wandb_name="accessibility_env_hackathon", # Your Wandb run name - # data_path_to_save_groups="accessibility_rollouts.jsonl" # Often set via CLI for process + tokenizer_name="NousResearch/Llama-3-8B-Instruct- যেভাবে-তুমি-বাংলা-বলো", # Placeholder + group_size=2, # Smaller for faster testing initially + use_wandb=True, + rollout_server_url="http://localhost:8000", + total_steps=10, # For process mode, number of items to generate + batch_size=4, # Max items in a single call to score (related to group_size) + steps_per_eval=5, + max_token_length=2048, + wandb_name="accessibility_env_hackathon_dev", # Dev run name ) server_configs = [ APIServerConfig( - model_name="gpt-3.5-turbo", # Placeholder, use your desired model - # base_url="YOUR_LLM_PROVIDER_BASE_URL_IF_NOT_OPENAI_DEFAULT", # e.g., for vLLM - # api_key="YOUR_API_KEY_HERE_OR_USE_ENV_VAR", # Best to use os.environ.get("OPENAI_API_KEY") - num_requests_for_eval=32, # Example + model_name="gpt-3.5-turbo", # Or your preferred model + # base_url=None, # Defaults to OpenAI if None + api_key=os.environ.get( + "OPENAI_API_KEY", "YOUR_API_KEY_PLACEHOLDER_IF_NOT_SET" + ), # Important! + num_requests_for_eval=16, ), ] return env_config, server_configs From 94038876f4174fe72edbd1e0596f560425b53eac Mon Sep 17 00:00:00 2001 From: Josh Date: Sun, 18 May 2025 13:32:59 -0700 Subject: [PATCH 3/8] Add tokenizer. Fix typing --- .../accessibility_env/accessibility_env.py | 161 ++++++++++++------ 1 file changed, 112 insertions(+), 49 deletions(-) diff --git a/environments/hack0/accessibility_env/accessibility_env.py b/environments/hack0/accessibility_env/accessibility_env.py index 354c1131..b3095680 100644 --- a/environments/hack0/accessibility_env/accessibility_env.py +++ b/environments/hack0/accessibility_env/accessibility_env.py @@ -1,6 +1,8 @@ # environments/hack0/accessibility_env/accessibility_env.py import os # For API keys, etc. 
-from typing import List, Optional, Tuple # Common type hints, added Dict +from typing import Dict, List, Optional, Tuple # Common type hints, added Dict + +from transformers.models.auto.tokenization_auto import AutoTokenizer # Corrected imports for Atropos types from atroposlib.envs.base import ( @@ -61,9 +63,39 @@ class AccessibilityEnv(BaseEnv): async def setup(self): print(f"[{self.name}] Setting up environment...") # Load dataset, initialize tools (e.g., HTML parser) here - self.dataset = [] # Placeholder for your HTML snippets + try: + self.tokenizer = AutoTokenizer.from_pretrained( + self.config.tokenizer_name, trust_remote_code=True + ) + # It's good practice to set pad_token if it's not already set, common for GPT-like models + if self.tokenizer.pad_token is None: + self.tokenizer.pad_token = self.tokenizer.eos_token + print( + f"[{self.name}] Tokenizer '{self.config.tokenizer_name}' loaded successfully." + ) + except Exception as e: + print( + f"[{self.name}] Error loading tokenizer '{self.config.tokenizer_name}': {e}" + ) + # Decide how to handle this - raise error, or try to proceed without tokenization (not ideal for Atropos) + # For now, let's allow it to proceed, but tokenization will fail later if tokenizer is None + # A better approach might be to raise an exception here if tokenizer is critical. + # raise RuntimeError(f"Failed to load tokenizer: {e}") from e + + self.dataset = [ + { + "id": "ex001", + "html": "

Welcome

", + "issues_to_fix": ["missing_alt_text"], + }, + { + "id": "ex002", + "html": "", + "issues_to_fix": ["missing_for_attribute_on_label"], + }, + ] # Placeholder for your HTML snippets self.iter = 0 - print(f"[{self.name}] Setup complete.") + print(f"[{self.name}] Setup complete. Loaded {len(self.dataset)} items.") async def get_next_item(self) -> Optional[Item]: if self.iter >= len(self.dataset): @@ -124,10 +156,10 @@ class AccessibilityEnv(BaseEnv): return item_data # This will be like {"html": "...", "id": "..."} async def collect_trajectories( - self, item_data: Item + self, item: Item ) -> Tuple[Optional[ScoredDataGroup], List[Item]]: # 'item_data' here is what get_next_item returned. - original_html = item_data["html"] + original_html = item["html"] system_message_content = ( "You are an expert web developer specializing in accessibility. " "Given the following HTML snippet, please make the minimal necessary modifications " @@ -163,7 +195,7 @@ class AccessibilityEnv(BaseEnv): { "full_exchange_messages": full_exchange_messages, # For tokenization "llm_modified_html": llm_response_content, # For direct scoring - "original_html_info": item_data, # To know what to check against + "original_html_info": item, # To know what to check against } ) @@ -177,72 +209,103 @@ class AccessibilityEnv(BaseEnv): scored_data_group = await self.score(to_score_inputs) return scored_data_group, [] # Assuming no backlog for now - async def score(self, rollout_group_data: List[dict]) -> Optional[ScoredDataGroup]: - # rollout_group_data is a list of dicts, each like: - # { - # "full_exchange_messages": [...], - # "llm_modified_html": "...", - # "original_html_info": {"html": "...", "id": "...", "issues": [...]} - # } + async def score( + self, rollout_group_data: List[dict] + ) -> Optional[ScoredDataGroup]: # Return type is still ScoredDataGroup print(f"[{self.name}] Scoring {len(rollout_group_data)} rollouts...") - scores_obj = ScoredDataGroup() # Use the Atropos defined type - # Initialize lists within scores_obj as per ScoredDataGroup structure - # (typically 'tokens', 'masks', 'scores', maybe 'logprobs') - scores_obj["tokens"] = [] - scores_obj["masks"] = [] - scores_obj["scores"] = [] - # scores_obj["infos"] = [] # Optional for extra debug info + + all_tokens: List[List[int]] = [] + all_masks: List[List[int]] = [] + all_scores: List[float] = [] + # For TypedDict, optional fields that are not provided will simply not be keys in the dictionary. + # However, if we want to include them as None, we can. Let's prepare for that. + all_advantages: Optional[List[List[float]]] = ( + None # Or initialize as [] if you might populate it + ) + all_ref_logprobs: Optional[List[List[float]]] = None # Or initialize as [] + all_messages_for_trainer: Optional[List[List[Dict]]] = ( + None # Assuming Message is also a dict-like structure or TypedDict + ) for data_item in rollout_group_data: llm_html = data_item["llm_modified_html"] original_info = data_item["original_html_info"] - # Basic reward: 1.0 if fixed, -1.0 if not. - # This will be replaced with actual WCAG checks. 
- current_score = -1.0 # Default to failure - # ---- YOUR SCORING LOGIC HERE ---- - # Example: (pseudo-code, requires BeautifulSoup and specific checks) - # violations_fixed = self.check_wcag_fixes(llm_html, original_info) - # if violations_fixed: - # current_score = 1.0 - # For now, a placeholder: + current_score = -1.0 if "" in original_info["html"] and "for=" in llm_html: current_score = 1.0 - # Tokenize the full exchange for the trainer - # The 'tokenize_for_trainer' util expects a tuple/list of message dicts - tokenized_output = tokenize_for_trainer( - self.tokenizer, - data_item["full_exchange_messages"], # Pass the list of message dicts - ) + try: + # Ensure self.tokenizer is initialized in __init__ or setup + if not hasattr(self, "tokenizer") or self.tokenizer is None: + print(f"[{self.name}] Error: Tokenizer not initialized.") + # Attempt to initialize it here if it makes sense, or ensure it's done in setup() + # from transformers import AutoTokenizer + # self.tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_name, trust_remote_code=True) + # This is a fallback, better to ensure it's in setup() + # For now, let's assume it's there. If not, this will fail earlier or be caught by linter. + pass # Assuming tokenizer is initialized + + tokenized_output = tokenize_for_trainer( + self.tokenizer, # Make sure self.tokenizer is loaded, e.g., in setup() + data_item["full_exchange_messages"], + ) + except Exception as e: + print(f"[{self.name}] Error during tokenization: {e}. Skipping item.") + continue - # Ensure tokenized_output contains 'tokens' and 'masks' if "tokens" not in tokenized_output or "masks" not in tokenized_output: print( f"[{self.name}] Warning: Tokenization did not return tokens/masks for an item. Skipping." ) continue - scores_obj["tokens"].append(tokenized_output["tokens"]) - scores_obj["masks"].append(tokenized_output["masks"]) - scores_obj["scores"].append(current_score) - # scores_obj["infos"].append({"original_id": original_info["id"], "llm_output_preview": llm_html[:100]}) + all_tokens.append(tokenized_output["tokens"]) + all_masks.append(tokenized_output["masks"]) + all_scores.append(current_score) - # Handle case where no valid items were scored - if not scores_obj["scores"]: + # If you were to populate optional fields, you'd do it here. For example: + # if "advantages" in tokenized_output: # Fictional example + # if all_advantages is None: all_advantages = [] + # all_advantages.append(tokenized_output["advantages"]) + + if not all_scores: print(f"[{self.name}] No valid items to score, returning None.") return None - # Atropos convention: if all scores are identical, return None (no learning signal) - # This might be too strict for early testing. Consider enabling later. - # if len(set(scores_obj["scores"])) == 1 and len(scores_obj["scores"]) > 1 : - # print(f"[{self.name}] All scores are identical ({scores_obj['scores'][0]}), returning None.") - # return None + # print(f"[{self.name}] Scoring complete. Scores: {all_scores}") # Already printed if successful below - print(f"[{self.name}] Scoring complete. 
Scores: {scores_obj['scores']}") - return scores_obj + # Construct the dictionary that conforms to ScoredDataGroup TypedDict + # Mandatory fields: + data_to_return: ScoredDataGroup = { + "tokens": all_tokens, + "masks": all_masks, + "scores": all_scores, + "advantages": None, + "ref_logprobs": None, + "group_overrides": None, + "messages": None, + "overrides": None, + } + + # Add optional fields if they have values (or if you explicitly want them as None) + # The TypedDict definition uses Optional[], so if a key is missing, it's fine. + # If you want to explicitly include them as None if not populated: + if all_advantages is not None: + data_to_return["advantages"] = all_advantages + if all_ref_logprobs is not None: + data_to_return["ref_logprobs"] = all_ref_logprobs + if all_messages_for_trainer is not None: + data_to_return["messages"] = all_messages_for_trainer + # group_overrides and overrides are also optional + + print( + f"""[{self.name}] Scoring complete. Data to return (first score): + {data_to_return['scores'][0] if data_to_return['scores'] else 'N/A'}""" + ) + return data_to_return async def evaluate( self, From 8ff2b02ce01df3c1a9401f21a6911f6042f02d0a Mon Sep 17 00:00:00 2001 From: Josh Date: Sun, 18 May 2025 14:44:16 -0700 Subject: [PATCH 4/8] Working POC --- .../accessibility_env/accessibility_env.py | 208 ++++++++++++++---- .../hack0/accessibility_env/requirements.txt | 1 + 2 files changed, 170 insertions(+), 39 deletions(-) diff --git a/environments/hack0/accessibility_env/accessibility_env.py b/environments/hack0/accessibility_env/accessibility_env.py index b3095680..d46aebbb 100644 --- a/environments/hack0/accessibility_env/accessibility_env.py +++ b/environments/hack0/accessibility_env/accessibility_env.py @@ -2,6 +2,9 @@ import os # For API keys, etc. from typing import Dict, List, Optional, Tuple # Common type hints, added Dict +import tenacity + +# from bs4 import BeautifulSoup from transformers.models.auto.tokenization_auto import AutoTokenizer # Corrected imports for Atropos types @@ -38,23 +41,26 @@ class AccessibilityEnv(BaseEnv): @classmethod def config_init(cls) -> Tuple[AccessibilityEnvConfig, List[APIServerConfig]]: env_config = AccessibilityEnvConfig( - tokenizer_name="NousResearch/Llama-3-8B-Instruct- যেভাবে-তুমি-বাংলা-বলো", # Placeholder - group_size=2, # Smaller for faster testing initially + tokenizer_name="meta-llama/Llama-2-7b-chat-hf", + group_size=1, # Smaller for faster testing initially use_wandb=True, rollout_server_url="http://localhost:8000", - total_steps=10, # For process mode, number of items to generate - batch_size=4, # Max items in a single call to score (related to group_size) + total_steps=3, # For process mode, number of items to generate + batch_size=1, # Max items in a single call to score (related to group_size) steps_per_eval=5, max_token_length=2048, - wandb_name="accessibility_env_hackathon_dev", # Dev run name + wandb_name="accessibility_llama_dev", # Dev run name ) + + llama_api_key = os.environ.get("LLAMA_API_KEY") + if not llama_api_key: + print("WARNING: LLAMA_API_KEY environment variable not set!") + server_configs = [ APIServerConfig( - model_name="gpt-3.5-turbo", # Or your preferred model - # base_url=None, # Defaults to OpenAI if None - api_key=os.environ.get( - "OPENAI_API_KEY", "YOUR_API_KEY_PLACEHOLDER_IF_NOT_SET" - ), # Important! 
+ model_name="Llama-4-Maverick-17B-128E-Instruct-FP8", + base_url="https://api.llama.com/v1", # <<<---- Llama API base URL + api_key=llama_api_key, num_requests_for_eval=16, ), ] @@ -62,14 +68,34 @@ class AccessibilityEnv(BaseEnv): async def setup(self): print(f"[{self.name}] Setting up environment...") - # Load dataset, initialize tools (e.g., HTML parser) here try: self.tokenizer = AutoTokenizer.from_pretrained( self.config.tokenizer_name, trust_remote_code=True - ) - # It's good practice to set pad_token if it's not already set, common for GPT-like models + ) # tokenizer_name is 'gpt2' if self.tokenizer.pad_token is None: self.tokenizer.pad_token = self.tokenizer.eos_token + + # Set a default chat template if it's not already set + # This is crucial for tokenizers like 'gpt2' that don't have one by default. + if self.tokenizer.chat_template is None: + # A common, simple template. You might need to adjust based on how gpt-3.5-turbo expects chat. + # For gpt-3.5-turbo, the actual formatting is handled by the API, + # but for local tokenization for the trainer, we need *a* template. + # A basic template for generic tokenization: + self.tokenizer.chat_template = ( + "{% for message in messages %}" + "{{ message['role'] + ': ' + message['content'] + '\\n' }}" + "{% endfor %}" + ) + # Alternatively, for many models, a more structured Jinja template like + # the Llama or ChatML one might be used if you were training with such a format. + # For just getting token IDs for a generic model for RL, the simple one above might suffice. + # Or, if tokenize_for_trainer is smart, it might just concatenate. + # Let's check if a simpler approach is needed for tokenize_for_trainer. + print( + f"[{self.name}] Set a default chat_template for tokenizer '{self.config.tokenizer_name}'." + ) + print( f"[{self.name}] Tokenizer '{self.config.tokenizer_name}' loaded successfully." ) @@ -77,10 +103,7 @@ class AccessibilityEnv(BaseEnv): print( f"[{self.name}] Error loading tokenizer '{self.config.tokenizer_name}': {e}" ) - # Decide how to handle this - raise error, or try to proceed without tokenization (not ideal for Atropos) - # For now, let's allow it to proceed, but tokenization will fail later if tokenizer is None - # A better approach might be to raise an exception here if tokenizer is critical. - # raise RuntimeError(f"Failed to load tokenizer: {e}") from e + raise RuntimeError(f"Failed to load tokenizer: {e}") from e self.dataset = [ { @@ -93,7 +116,7 @@ class AccessibilityEnv(BaseEnv): "html": "", "issues_to_fix": ["missing_for_attribute_on_label"], }, - ] # Placeholder for your HTML snippets + ] self.iter = 0 print(f"[{self.name}] Setup complete. Loaded {len(self.dataset)} items.") @@ -175,12 +198,130 @@ class AccessibilityEnv(BaseEnv): {"role": "user", "content": user_message_content}, ] - chat_completions = await self.server.chat_completion( - messages=messages, - n=self.config.group_size, - max_tokens=self.config.max_token_length, - # temperature=0.7, # Optional: adjust for creativity vs. determinism - ) + try: + chat_completions = await self.server.chat_completion( + messages=messages, + n=self.config.group_size, # Number of completions + # `max_tokens` here is for the *completion* part, not the whole context. + # Your Llama API example used 256. Adjust as needed for HTML output. + max_tokens=1024, # Max tokens for the LLM's response + # temperature=0.7, # Optional: adjust for creativity vs. 
determinism + # model=self.server_configs[0].model_name # This should be picked up automatically from server_configs + # by the self.server object. + ) + except tenacity.RetryError as retry_err: # Specifically catch RetryError + print( + "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" + ) + print(f"[{self.name}] TENACITY RETRY ERROR during chat_completion call:") + print(f"[{self.name}] RetryError Details: {retry_err}") + # ... and the response details if available on 'e' ... + original_exception = None + if retry_err.last_attempt: + if retry_err.last_attempt.failed: + original_exception = retry_err.last_attempt.exception() + print( + f"[{self.name}] Last attempt failed. Original exception that caused retries:" + ) + print(f"[{self.name}] Type: {type(original_exception)}") + print( + f"[{self.name}] Args: {original_exception.args if original_exception else 'N/A'}" + ) + print( + f"[{self.name}] Full Str: {str(original_exception)}" + ) # More direct string representation + else: + # This case is unusual for a RetryError due to failure + print( + f"""[{self.name}] Last attempt recorded but did not 'fail'. + Result: {retry_err.last_attempt.result()}""" + ) + else: + print( + f"""[{self.name}] Could not get 'last_attempt' details from + RetryError object. Raw RetryError: {retry_err}""" + ) + + # Now, if we have the original_exception, try to get more details (like HTTP response) + if original_exception: + # Check if the original exception is an OpenAI/HTTPX style error + # by looking for a 'response' attribute. + if ( + hasattr(original_exception, "response") + and original_exception.response is not None + ): + response_obj = original_exception.response + status_code_text = "Status code N/A" + response_content_text = "Response content N/A" + + if hasattr(response_obj, "status_code"): + status_code_text = str(response_obj.status_code) + + print( + f"[{self.name}] Underlying API Response Status Code: {status_code_text}" + ) + + # Try to get JSON content first (common for API errors) + if hasattr(response_obj, "json") and callable(response_obj.json): + try: + response_json_parsed = ( + response_obj.json() + ) # Note: this might need to be awaited if response_obj.json is async + # but typically in an exception, it's already processed. 
+ print( + f"[{self.name}] Underlying API Response JSON: {response_json_parsed}" + ) + except Exception as json_e_inner: + print( + f"[{self.name}] Could not parse underlying API response as JSON: {json_e_inner}" + ) + # Fallback to text if JSON parsing fails + if hasattr(response_obj, "text"): + response_content_text = response_obj.text + print( + f"[{self.name}] Underlying API Response Text: {response_content_text}" + ) + elif hasattr(response_obj, "content"): # often bytes + try: + response_content_text = ( + response_obj.content.decode() + ) + print( + f"""[{self.name}] Underlying API Response + Content (decoded): {response_content_text}""" + ) + except Exception: + response_content_text = str(response_obj.content) + print( + f"""[{self.name}] Underlying API Response Content + (raw bytes as str): {response_content_text}""" + ) + # If no json() method, try .text or .content directly + elif hasattr(response_obj, "text"): + response_content_text = response_obj.text + print( + f"[{self.name}] Underlying API Response Text: {response_content_text}" + ) + elif hasattr(response_obj, "content"): + try: + response_content_text = response_obj.content.decode() + print( + f"[{self.name}] Underlying API Response Content (decoded): {response_content_text}" + ) + except Exception: + response_content_text = str(response_obj.content) + print( + f"""[{self.name}] Underlying API Response Content + (raw bytes as str): {response_content_text}""" + ) + + print( + "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" + ) + print( + f"[{self.name}] Messages that were sent during the attempt resulting in RetryError: {messages}" + ) + return None, [] to_score_inputs = [] for choice in chat_completions.choices: @@ -283,24 +424,13 @@ class AccessibilityEnv(BaseEnv): "tokens": all_tokens, "masks": all_masks, "scores": all_scores, - "advantages": None, - "ref_logprobs": None, - "group_overrides": None, - "messages": None, + "advantages": all_advantages, + "ref_logprobs": all_ref_logprobs, + "group_overrides": {}, + "messages": all_messages_for_trainer, "overrides": None, } - # Add optional fields if they have values (or if you explicitly want them as None) - # The TypedDict definition uses Optional[], so if a key is missing, it's fine. - # If you want to explicitly include them as None if not populated: - if all_advantages is not None: - data_to_return["advantages"] = all_advantages - if all_ref_logprobs is not None: - data_to_return["ref_logprobs"] = all_ref_logprobs - if all_messages_for_trainer is not None: - data_to_return["messages"] = all_messages_for_trainer - # group_overrides and overrides are also optional - print( f"""[{self.name}] Scoring complete. Data to return (first score): {data_to_return['scores'][0] if data_to_return['scores'] else 'N/A'}""" diff --git a/environments/hack0/accessibility_env/requirements.txt b/environments/hack0/accessibility_env/requirements.txt index e69de29b..c1f5f713 100644 --- a/environments/hack0/accessibility_env/requirements.txt +++ b/environments/hack0/accessibility_env/requirements.txt @@ -0,0 +1 @@ +beautifulsoup4 From 904360a02e3bcee92830dc2df1d6bfc5fc445e57 Mon Sep 17 00:00:00 2001 From: Josh Date: Sun, 18 May 2025 17:38:29 -0700 Subject: [PATCH 5/8] Cleanup. 
End-to-end functionality in place --- .../accessibility_env/accessibility_env.py | 526 +++++++----------- .../accessibility_env/accessibility_rules.py | 137 +++++ .../accessibility_env/compressed_rollouts.zip | Bin 0 -> 4393 bytes .../hack0/accessibility_env/requirements.txt | 3 +- 4 files changed, 351 insertions(+), 315 deletions(-) create mode 100644 environments/hack0/accessibility_env/accessibility_rules.py create mode 100644 environments/hack0/accessibility_env/compressed_rollouts.zip diff --git a/environments/hack0/accessibility_env/accessibility_env.py b/environments/hack0/accessibility_env/accessibility_env.py index d46aebbb..79dbd756 100644 --- a/environments/hack0/accessibility_env/accessibility_env.py +++ b/environments/hack0/accessibility_env/accessibility_env.py @@ -1,32 +1,37 @@ -# environments/hack0/accessibility_env/accessibility_env.py -import os # For API keys, etc. -from typing import Dict, List, Optional, Tuple # Common type hints, added Dict +import json +import os +from typing import Dict, List, Optional, Tuple -import tenacity - -# from bs4 import BeautifulSoup +from bs4 import BeautifulSoup +from pydantic import Field from transformers.models.auto.tokenization_auto import AutoTokenizer -# Corrected imports for Atropos types from atroposlib.envs.base import ( APIServerConfig, BaseEnv, BaseEnvConfig, ScoredDataGroup, ) -from atroposlib.type_definitions import ( # GameHistory might not be needed yet, Item is common - Item, -) +from atroposlib.type_definitions import Item from atroposlib.utils.tokenize_for_trainer import tokenize_for_trainer +from .accessibility_rules import ( + AccessibilityRule, + LabelAssociationRule, + MissingAltTextRule, +) + class AccessibilityEnvConfig(BaseEnvConfig): - # Add any custom config fields specific to your env later - pass + dataset_path: str = Field( + default="data/accessibility_dataset.jsonl", # Default relative path + description="Path to the JSONL file containing the accessibility dataset.", + ) class AccessibilityEnv(BaseEnv): - name = "accessibility_env" # A unique name for your environment + config: AccessibilityEnvConfig + name = "accessibility_env" def __init__( self, @@ -36,33 +41,44 @@ class AccessibilityEnv(BaseEnv): testing=False, ): super().__init__(config, server_configs, slurm, testing) - # Initialize any env-specific attributes here + self.tokenizer = None + + # Initialize your list of rule instances + self.accessibility_rules: List[AccessibilityRule] = [ + MissingAltTextRule(), + LabelAssociationRule(), + ] + + # For quick lookup if needed, though iterating self.accessibility_rules is fine + self.rules_by_key = {rule.issue_key: rule for rule in self.accessibility_rules} @classmethod def config_init(cls) -> Tuple[AccessibilityEnvConfig, List[APIServerConfig]]: + current_dataset_size = 10 + env_config = AccessibilityEnvConfig( - tokenizer_name="meta-llama/Llama-2-7b-chat-hf", - group_size=1, # Smaller for faster testing initially + tokenizer_name="gpt2", + group_size=8, use_wandb=True, rollout_server_url="http://localhost:8000", - total_steps=3, # For process mode, number of items to generate - batch_size=1, # Max items in a single call to score (related to group_size) - steps_per_eval=5, + total_steps=current_dataset_size, + batch_size=1, + steps_per_eval=current_dataset_size, max_token_length=2048, - wandb_name="accessibility_llama_dev", # Dev run name + wandb_name="accessibility_openai_default_dev", ) - llama_api_key = os.environ.get("LLAMA_API_KEY") - if not llama_api_key: - print("WARNING: LLAMA_API_KEY 
environment variable not set!") + openai_api_key_from_env = os.environ.get("OPENAI_API_KEY") + if not openai_api_key_from_env: + print( + "WARNING (from config_init): OPENAI_API_KEY environment variable not set for default config!" + ) server_configs = [ APIServerConfig( - model_name="Llama-4-Maverick-17B-128E-Instruct-FP8", - base_url="https://api.llama.com/v1", # <<<---- Llama API base URL - api_key=llama_api_key, - num_requests_for_eval=16, - ), + model_name="gpt-3.5-turbo", + api_key=openai_api_key_from_env, + ) ] return env_config, server_configs @@ -71,27 +87,17 @@ class AccessibilityEnv(BaseEnv): try: self.tokenizer = AutoTokenizer.from_pretrained( self.config.tokenizer_name, trust_remote_code=True - ) # tokenizer_name is 'gpt2' + ) if self.tokenizer.pad_token is None: self.tokenizer.pad_token = self.tokenizer.eos_token - # Set a default chat template if it's not already set - # This is crucial for tokenizers like 'gpt2' that don't have one by default. if self.tokenizer.chat_template is None: - # A common, simple template. You might need to adjust based on how gpt-3.5-turbo expects chat. - # For gpt-3.5-turbo, the actual formatting is handled by the API, - # but for local tokenization for the trainer, we need *a* template. - # A basic template for generic tokenization: self.tokenizer.chat_template = ( "{% for message in messages %}" "{{ message['role'] + ': ' + message['content'] + '\\n' }}" "{% endfor %}" ) - # Alternatively, for many models, a more structured Jinja template like - # the Llama or ChatML one might be used if you were training with such a format. - # For just getting token IDs for a generic model for RL, the simple one above might suffice. - # Or, if tokenize_for_trainer is smart, it might just concatenate. - # Let's check if a simpler approach is needed for tokenize_for_trainer. + print( f"[{self.name}] Set a default chat_template for tokenizer '{self.config.tokenizer_name}'." ) @@ -105,83 +111,57 @@ class AccessibilityEnv(BaseEnv): ) raise RuntimeError(f"Failed to load tokenizer: {e}") from e - self.dataset = [ - { - "id": "ex001", - "html": "

Welcome

", - "issues_to_fix": ["missing_alt_text"], - }, - { - "id": "ex002", - "html": "", - "issues_to_fix": ["missing_for_attribute_on_label"], - }, - ] + # Load dataset from file + self.dataset: List[Dict] = [] + env_script_dir = os.path.dirname(os.path.abspath(__file__)) + full_dataset_path = os.path.join(env_script_dir, self.config.dataset_path) + + print(f"[{self.name}] Attempting to load dataset from: {full_dataset_path}") + try: + with open(full_dataset_path, "r", encoding="utf-8") as f: + for line in f: + if line.strip(): # Ensure line is not empty + self.dataset.append(json.loads(line)) + if not self.dataset: + raise FileNotFoundError( + "Dataset file was empty or contained no valid JSON lines." + ) + except FileNotFoundError: + print(f"[{self.name}] ERROR: Dataset file not found at {full_dataset_path}") + raise + except json.JSONDecodeError as e: + print( + f"[{self.name}] ERROR: Failed to decode JSON from {full_dataset_path}. Error: {e}" + ) + raise + except Exception as e: + print( + f"[{self.name}] ERROR: An unexpected error occurred while loading dataset: {e}" + ) + raise + self.iter = 0 - print(f"[{self.name}] Setup complete. Loaded {len(self.dataset)} items.") + print( + f"""[{self.name}] Setup complete. Loaded {len(self.dataset)} + items. Initialized {len(self.accessibility_rules)} accessibility rules.""" + ) async def get_next_item(self) -> Optional[Item]: if self.iter >= len(self.dataset): - if ( - self.iter >= self.config.total_steps - ): # Stop after total_steps for 'process' + if self.iter >= self.config.total_steps: return None - # Potentially loop dataset or handle running out of unique items - # For hackathon, just stopping might be fine if dataset is small - # and total_steps is matched to dataset size. - # self.iter = 0 # To loop + print(f"[{self.name}] Reached end of dataset or total_steps.") return None item_data = self.dataset[self.iter] self.iter += 1 - # Format item_data into the 'Item' structure Atropos expects - # Typically (prompt_messages_tuple, gold_answer_or_metadata_tuple) - # Example: - # user_prompt = {"role": "user", "content": f"Make this HTML accessible: {item_data['html_snippet']}"} - # system_prompt_content = "You are an AI assistant specializing in web accessibility. Modify the given - # HTML to meet WCAG AA standards. Output only the modified HTML." - # system_prompt = {"role": "system", "content": system_prompt_content} - # prompt_messages = (system_prompt, user_prompt) # This needs to be a tuple of dicts - # messages_for_item = tuple(frozenset(p.items()) for p in prompt_messages) # Atropos often expects this format - # return (messages_for_item, item_data.get('expected_outcome_or_id')) # Second part is for scoring reference - # Simpler start for prompt: - # prompt = ( - # ( - # { - # "role": "system", - # "content": "You are an AI assistant. Given HTML, make it more accessible.", - # }, - # ), - # ({"role": "user", "content": f"Original HTML: {item_data['html']}"},), - # ) - # This prompt structure might need adjustment based on how Atropos and the LLM API expect it. - # The gsm8k example has: - # user_message = {"role": "user", "content": item["question"]} - # chat_completions = await self.server.chat_completion( - # messages=[{"role": "system", "content": system_prompt}, user_message], ... - # So a list of dicts is passed to chat_completion. - # The 'Item' type for get_next_item is often a tuple: ( (message_part_1, message_part_2, ...), - # metadata_for_scoring ) - # where each message_part is often a frozenset of items from a dict. 
This is a bit complex. - # Let's start with a simple string prompt and adapt. - # For now, let's assume item is (prompt_string, metadata_for_scoring) - # The `collect_trajectories` in coding_server.py takes `item: Item` - # and then accesses `item[0][0]` which implies item is nested. - # `prompt = tuple([frozenset({"role": "user", "content": next_item["description"]}.items())])` - # `return (prompt, answer)` - # So, first element of item is a tuple of frozensets. - - # Let's simplify for now and refine based on Atropos internals if needed. - # We'll construct the messages list directly in collect_trajectories. - # So get_next_item can return the raw data needed. - return item_data # This will be like {"html": "...", "id": "..."} + return item_data async def collect_trajectories( self, item: Item ) -> Tuple[Optional[ScoredDataGroup], List[Item]]: - # 'item_data' here is what get_next_item returned. original_html = item["html"] system_message_content = ( "You are an expert web developer specializing in accessibility. " @@ -198,262 +178,180 @@ class AccessibilityEnv(BaseEnv): {"role": "user", "content": user_message_content}, ] - try: - chat_completions = await self.server.chat_completion( - messages=messages, - n=self.config.group_size, # Number of completions - # `max_tokens` here is for the *completion* part, not the whole context. - # Your Llama API example used 256. Adjust as needed for HTML output. - max_tokens=1024, # Max tokens for the LLM's response - # temperature=0.7, # Optional: adjust for creativity vs. determinism - # model=self.server_configs[0].model_name # This should be picked up automatically from server_configs - # by the self.server object. - ) - except tenacity.RetryError as retry_err: # Specifically catch RetryError - print( - "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" - ) - print(f"[{self.name}] TENACITY RETRY ERROR during chat_completion call:") - print(f"[{self.name}] RetryError Details: {retry_err}") - # ... and the response details if available on 'e' ... - original_exception = None - if retry_err.last_attempt: - if retry_err.last_attempt.failed: - original_exception = retry_err.last_attempt.exception() - print( - f"[{self.name}] Last attempt failed. Original exception that caused retries:" - ) - print(f"[{self.name}] Type: {type(original_exception)}") - print( - f"[{self.name}] Args: {original_exception.args if original_exception else 'N/A'}" - ) - print( - f"[{self.name}] Full Str: {str(original_exception)}" - ) # More direct string representation - else: - # This case is unusual for a RetryError due to failure - print( - f"""[{self.name}] Last attempt recorded but did not 'fail'. - Result: {retry_err.last_attempt.result()}""" - ) - else: - print( - f"""[{self.name}] Could not get 'last_attempt' details from - RetryError object. Raw RetryError: {retry_err}""" - ) - - # Now, if we have the original_exception, try to get more details (like HTTP response) - if original_exception: - # Check if the original exception is an OpenAI/HTTPX style error - # by looking for a 'response' attribute. 
- if ( - hasattr(original_exception, "response") - and original_exception.response is not None - ): - response_obj = original_exception.response - status_code_text = "Status code N/A" - response_content_text = "Response content N/A" - - if hasattr(response_obj, "status_code"): - status_code_text = str(response_obj.status_code) - - print( - f"[{self.name}] Underlying API Response Status Code: {status_code_text}" - ) - - # Try to get JSON content first (common for API errors) - if hasattr(response_obj, "json") and callable(response_obj.json): - try: - response_json_parsed = ( - response_obj.json() - ) # Note: this might need to be awaited if response_obj.json is async - # but typically in an exception, it's already processed. - print( - f"[{self.name}] Underlying API Response JSON: {response_json_parsed}" - ) - except Exception as json_e_inner: - print( - f"[{self.name}] Could not parse underlying API response as JSON: {json_e_inner}" - ) - # Fallback to text if JSON parsing fails - if hasattr(response_obj, "text"): - response_content_text = response_obj.text - print( - f"[{self.name}] Underlying API Response Text: {response_content_text}" - ) - elif hasattr(response_obj, "content"): # often bytes - try: - response_content_text = ( - response_obj.content.decode() - ) - print( - f"""[{self.name}] Underlying API Response - Content (decoded): {response_content_text}""" - ) - except Exception: - response_content_text = str(response_obj.content) - print( - f"""[{self.name}] Underlying API Response Content - (raw bytes as str): {response_content_text}""" - ) - # If no json() method, try .text or .content directly - elif hasattr(response_obj, "text"): - response_content_text = response_obj.text - print( - f"[{self.name}] Underlying API Response Text: {response_content_text}" - ) - elif hasattr(response_obj, "content"): - try: - response_content_text = response_obj.content.decode() - print( - f"[{self.name}] Underlying API Response Content (decoded): {response_content_text}" - ) - except Exception: - response_content_text = str(response_obj.content) - print( - f"""[{self.name}] Underlying API Response Content - (raw bytes as str): {response_content_text}""" - ) - - print( - "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" - ) - print( - f"[{self.name}] Messages that were sent during the attempt resulting in RetryError: {messages}" - ) - return None, [] + chat_completions = await self.server.chat_completion( + messages=messages, + n=self.config.group_size, + max_tokens=1024, + ) to_score_inputs = [] - for choice in chat_completions.choices: - llm_response_content = choice.message.content - # The 'messages' to store for scoring/tokenization should represent the full exchange - # that led to this specific llm_response_content. - # This includes the original system and user messages, and the assistant's response. - full_exchange_messages = messages + [ - {"role": "assistant", "content": llm_response_content} - ] - to_score_inputs.append( - { - "full_exchange_messages": full_exchange_messages, # For tokenization - "llm_modified_html": llm_response_content, # For direct scoring - "original_html_info": item, # To know what to check against - } - ) - - # The `score` method in Atropos expects a list where each element typically is - # (messages_tuple_for_tokenization, original_item_metadata_for_scoring_logic) - # We need to adapt `to_score_inputs` to what `self.score` will expect. - # Let's define that `self.score` will take this list of dicts directly. 
- # The `collect_trajectories` from the blog post returns `to_postprocess, to_backlog` - # where `to_postprocess` is the output of `self.score`. + if chat_completions is not None: + for choice in chat_completions.choices: + llm_response_content = choice.message.content + full_exchange_messages = messages + [ + {"role": "assistant", "content": llm_response_content} + ] + to_score_inputs.append( + { + "full_exchange_messages": full_exchange_messages, + "llm_modified_html": llm_response_content, + "original_html_info": item, + } + ) scored_data_group = await self.score(to_score_inputs) return scored_data_group, [] # Assuming no backlog for now - async def score( - self, rollout_group_data: List[dict] - ) -> Optional[ScoredDataGroup]: # Return type is still ScoredDataGroup + async def score(self, rollout_group_data: List[dict]) -> Optional[ScoredDataGroup]: print(f"[{self.name}] Scoring {len(rollout_group_data)} rollouts...") - all_tokens: List[List[int]] = [] - all_masks: List[List[int]] = [] - all_scores: List[float] = [] - # For TypedDict, optional fields that are not provided will simply not be keys in the dictionary. - # However, if we want to include them as None, we can. Let's prepare for that. - all_advantages: Optional[List[List[float]]] = ( - None # Or initialize as [] if you might populate it - ) - all_ref_logprobs: Optional[List[List[float]]] = None # Or initialize as [] - all_messages_for_trainer: Optional[List[List[Dict]]] = ( - None # Assuming Message is also a dict-like structure or TypedDict - ) + # Initialize lists to store data for all successfully processed items in the batch + final_tokens_batch: List[List[int]] = [] + final_masks_batch: List[List[int]] = [] + final_scores_batch: List[float] = [] + final_concatenated_dialogues_batch: List[str] = [] + + # Optional fields for ScoredDataGroup, will remain None for this basic setup + all_advantages: Optional[List[List[float]]] = None + all_ref_logprobs: Optional[List[List[float]]] = None for data_item in rollout_group_data: - llm_html = data_item["llm_modified_html"] + llm_html_str = data_item["llm_modified_html"] original_info = data_item["original_html_info"] + full_exchange_messages_list_of_dicts = data_item[ + "full_exchange_messages" + ] # This is List[Dict[str, str]] - current_score = -1.0 - if "" in original_info["html"] and "for=" in llm_html: - current_score = 1.0 + current_item_score = 0.0 + num_issues_actually_fixed = 0 + issues_expected_to_fix = original_info.get("issues_to_fix", []) + num_issues_targeted = len(issues_expected_to_fix) + soup: Optional[BeautifulSoup] = None + can_proceed_with_rule_checks = False try: - # Ensure self.tokenizer is initialized in __init__ or setup - if not hasattr(self, "tokenizer") or self.tokenizer is None: - print(f"[{self.name}] Error: Tokenizer not initialized.") - # Attempt to initialize it here if it makes sense, or ensure it's done in setup() - # from transformers import AutoTokenizer - # self.tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_name, trust_remote_code=True) - # This is a fallback, better to ensure it's in setup() - # For now, let's assume it's there. If not, this will fail earlier or be caught by linter. 
- pass # Assuming tokenizer is initialized + soup = BeautifulSoup(llm_html_str, "lxml") + can_proceed_with_rule_checks = True + except Exception as e: + print( + f"[{self.name}] Item {original_info.get('id', 'N/A')}: Could not parse LLM output as HTML: {e}" + ) + if can_proceed_with_rule_checks and soup is not None: + for rule_instance in self.accessibility_rules: + if rule_instance.issue_key in issues_expected_to_fix: + try: + if rule_instance.check(soup, original_info): + num_issues_actually_fixed += 1 + print( + f"""[{self.name}] Item {original_info.get('id', 'N/A')}: + Rule '{rule_instance.issue_key}' PASSED.""" + ) + else: + print( + f"""[{self.name}] Item {original_info.get('id', 'N/A')}: + Rule '{rule_instance.issue_key}' FAILED.""" + ) + except Exception as rule_e: + print( + f"""[{self.name}] Item {original_info.get('id', 'N/A')}: + Error executing rule '{rule_instance.issue_key}': {rule_e}""" + ) + + # Determine score based on fixes and parseability + if num_issues_targeted > 0: + if not can_proceed_with_rule_checks: # Parsing failed + current_item_score = ( + -1.0 * num_issues_targeted + ) # Penalize per targeted issue if unparseable + elif num_issues_actually_fixed == num_issues_targeted: + current_item_score = 1.0 # All targeted issues fixed + elif ( + num_issues_actually_fixed > 0 + ): # Some, but not all, targeted issues fixed + current_item_score = 0.8 * ( + num_issues_actually_fixed / num_issues_targeted + ) + else: # Parseable, but no targeted issues fixed + current_item_score = -0.5 + else: # No issues were targeted for this item (e.g., input was considered good by dataset design) + if ( + not can_proceed_with_rule_checks + ): # LLM made a good input unparseable + current_item_score = -1.0 + else: # Parseable, and no issues were targeted (good input remained good) + current_item_score = 0.0 # Neutral score + + # Tokenization + try: + if not self.tokenizer: + raise ValueError("Tokenizer not initialized.") tokenized_output = tokenize_for_trainer( - self.tokenizer, # Make sure self.tokenizer is loaded, e.g., in setup() - data_item["full_exchange_messages"], + self.tokenizer, full_exchange_messages_list_of_dicts ) except Exception as e: - print(f"[{self.name}] Error during tokenization: {e}. Skipping item.") - continue + print( + f"""[{self.name}] Error during tokenization for item + {original_info.get('id', 'N/A')}: {e}. Skipping this item.""" + ) + continue # Skip to the next data_item in rollout_group_data if "tokens" not in tokenized_output or "masks" not in tokenized_output: print( - f"[{self.name}] Warning: Tokenization did not return tokens/masks for an item. Skipping." + f"""[{self.name}] Tokenization did not produce 'tokens' or + 'masks' for item {original_info.get('id', 'N/A')}. Skipping this item.""" ) - continue + continue # Skip to the next data_item - all_tokens.append(tokenized_output["tokens"]) - all_masks.append(tokenized_output["masks"]) - all_scores.append(current_score) + # If we reach here, scoring and tokenization for the current item were successful + final_tokens_batch.append(tokenized_output["tokens"]) + final_masks_batch.append(tokenized_output["masks"]) + final_scores_batch.append(current_item_score) - # If you were to populate optional fields, you'd do it here. 
-            # if "advantages" in tokenized_output:  # Fictional example
-            #     if all_advantages is None: all_advantages = []
-            #     all_advantages.append(tokenized_output["advantages"])
+            if self.config.include_messages:
+                formatted_message_log = "".join(
+                    f"{msg_dict['role']}: {msg_dict['content']}\n"
+                    for msg_dict in full_exchange_messages_list_of_dicts
+                )
+                final_concatenated_dialogues_batch.append(formatted_message_log.strip())
 
-        if not all_scores:
-            print(f"[{self.name}] No valid items to score, returning None.")
+        # After processing all items in rollout_group_data
+        if (
+            not final_scores_batch
+        ):  # If all items were skipped (e.g., due to tokenization errors)
+            print(
+                f"""[{self.name}] No valid items to include in ScoredDataGroup
+                after processing all rollouts, returning None."""
+            )
             return None
 
-        # print(f"[{self.name}] Scoring complete. Scores: {all_scores}") # Already printed if successful below
-
-        # Construct the dictionary that conforms to ScoredDataGroup TypedDict
-        # Mandatory fields:
         data_to_return: ScoredDataGroup = {
-            "tokens": all_tokens,
-            "masks": all_masks,
-            "scores": all_scores,
+            "tokens": final_tokens_batch,
+            "masks": final_masks_batch,
+            "scores": final_scores_batch,
             "advantages": all_advantages,
             "ref_logprobs": all_ref_logprobs,
             "group_overrides": {},
-            "messages": all_messages_for_trainer,
+            "messages": (
+                final_concatenated_dialogues_batch
+                if self.config.include_messages and final_concatenated_dialogues_batch
+                else None
+            ),  # type: ignore[assignment]
             "overrides": None,
         }
         print(
-            f"""[{self.name}] Scoring complete. Data to return (first score):
-            {data_to_return['scores'][0] if data_to_return['scores'] else 'N/A'}"""
+            f"[{self.name}] Scoring batch complete. Final scores for batch: {data_to_return['scores']}"
         )
         return data_to_return
 
     async def evaluate(
         self,
-    ):  # Optional, might not be needed for hackathon 'process' focus
+    ):
         print(f"[{self.name}] Evaluate method called (placeholder).")
         # Implement evaluation logic if you have a separate test set and metrics
         pass
 
-    # --- Helper methods for scoring ---
-    # def check_wcag_fixes(self, modified_html: str, original_item_info: dict) -> bool:
-    #     # Placeholder for your actual WCAG checking logic
-    #     # e.g., using BeautifulSoup to parse modified_html
-    #     # and checking against `original_item_info['issues_to_fix']`
-    #     # from bs4 import BeautifulSoup
-    #     # soup = BeautifulSoup(modified_html, 'html.parser')
-    #     # ... logic ...
-    #     return False
-
 if __name__ == "__main__":
     # This makes your environment runnable with `python accessibility_env.py process`

diff --git a/environments/hack0/accessibility_env/accessibility_rules.py b/environments/hack0/accessibility_env/accessibility_rules.py
new file mode 100644
index 00000000..28c08fe1
--- /dev/null
+++ b/environments/hack0/accessibility_env/accessibility_rules.py
@@ -0,0 +1,137 @@
+from abc import ABC, abstractmethod
+
+from bs4 import BeautifulSoup, Tag  # Ensure Tag is imported
+
+
+class AccessibilityRule(ABC):
+    """Abstract base class for an accessibility rule checker."""
+
+    @property
+    @abstractmethod
+    def issue_key(self) -> str:
+        """A unique string key identifying the type of issue this rule checks for.
+        This should match the keys used in the 'issues_to_fix' list in your dataset.
+        """
+        pass
+
+    @abstractmethod
+    def check(self, soup: BeautifulSoup, original_html_info: dict) -> bool:
+        """
+        Checks the provided HTML (parsed as a BeautifulSoup soup) for compliance
+        with this specific accessibility rule.
+
+        Args:
+            soup: The BeautifulSoup object representing the LLM's modified HTML.
+            original_html_info: A dictionary containing information about the original
+                HTML snippet, potentially including the original HTML string
+                if needed for context by some rules.
+
+        Returns:
+            True if the HTML passes this rule (i.e., the issue is fixed or wasn't present).
+            False if the HTML fails this rule (i.e., the issue persists or was introduced).
+        """
+        pass
+
+
+class MissingAltTextRule(AccessibilityRule):
+    @property
+    def issue_key(self) -> str:
+        return "missing_alt_text"
+
+    def check(self, soup: BeautifulSoup, original_html_info: dict) -> bool:
+        # Check if images were relevant in the first place for this item based on original_html_info
+        # This helps decide if an absence of images now is a failure or if the check is moot.
+        original_had_images = "<img" in original_html_info.get("html", "")
+        images = soup.find_all("img")
+        if not images:
+            # No images left in the output; only a failure if the original snippet actually had images.
+            return not original_had_images
+        for img_tag in images:
+            alt_text = img_tag.get("alt")
+            if alt_text is None or not alt_text.strip():
+                return False
+        return True
+
+
+class LabelAssociationRule(AccessibilityRule):
+    @property
+    def issue_key(self) -> str:
+        return "missing_label_for"  # Or "label_association" if you prefer
+
+    def check(self, soup: BeautifulSoup, original_html_info: dict) -> bool:
+        original_had_labels = "<label" in original_html_info.get("html", "")
+        form_fields = soup.find_all(["input", "select", "textarea"])
+        if not form_fields:
+            # No form fields left to label; only a problem if the original markup had label/field structure.
+            return not original_had_labels
+        for field in form_fields:
+            if field.get("type") in ("hidden", "submit", "button", "reset"):
+                continue
+            field_id = field.get("id")
+            has_explicit_label = bool(
+                field_id and soup.find("label", attrs={"for": field_id})
+            )
+            is_wrapped_in_label = field.find_parent("label") is not None
+            if not (has_explicit_label or is_wrapped_in_label):
+                return False
+        return True
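+
+
+class LinkHasTextRule(AccessibilityRule):
+    # NOTE: illustrative sketch only. The original implementation of this rule was
+    # lost from this patch; the README names a `LinkHasTextRule`, so a minimal
+    # reconstruction is given here. The issue_key string and the exact checks are
+    # assumptions and may not match the dataset or the author's code.
+    @property
+    def issue_key(self) -> str:
+        return "link_missing_text"  # assumed key; align with the dataset's issue tags
+
+    def check(self, soup: BeautifulSoup, original_html_info: dict) -> bool:
+        # A link passes if it exposes some accessible name: visible text,
+        # an aria-label, or a descendant image carrying an alt attribute.
+        for link in soup.find_all("a"):
+            has_visible_text = bool(link.get_text(strip=True))
+            has_aria_label = bool(link.get("aria-label", "").strip())
+            has_img_alt = link.find("img", alt=True) is not None
+            if not (has_visible_text or has_aria_label or has_img_alt):
+                return False
+        return True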
+beautifulsoup4>=4.0.0
+lxml>=4.0.0

From fedcf7d376053a0bfeb9eb08dfa3ddd3fdc91048 Mon Sep 17 00:00:00 2001
From: Josh
Date: Sun, 18 May 2025 17:39:12 -0700
Subject: [PATCH 6/8] Add README

---
 .../hack0/accessibility_env/README.md | 85 +++++++++++++++++++
 1 file changed, 85 insertions(+)

diff --git a/environments/hack0/accessibility_env/README.md b/environments/hack0/accessibility_env/README.md
index e69de29b..43ad2bd0 100644
--- a/environments/hack0/accessibility_env/README.md
+++ b/environments/hack0/accessibility_env/README.md
@@ -0,0 +1,85 @@
+# Accessibility Auto-Fixer Environment for Atropos
+
+**Team/Author:** Your Team Name / Your Name
+**Track:** Objective (WCAG rules are specific and rule-based)
+
+## Environment Design and Motivation
+
+### Problem Addressed
+
+Web accessibility is crucial for ensuring that websites and web applications are usable by everyone, including people with disabilities. Manually auditing and fixing HTML to meet Web Content Accessibility Guidelines (WCAG) is time-consuming and requires specialized knowledge. This Atropos environment fine-tunes an LLM to automatically identify and apply minimal, correct fixes to HTML snippets to improve their WCAG compliance.
+
+### Why This Is Important
+
+Automating accessibility improvements reduces effort and cost, leading to more inclusive web experiences. A fine-tuned model can serve as a developer assistant, batch-processor for large codebases, or educational tool.
+
+### How the Environment Works
+
+1. **Input:**
+   - HTML snippets from `data/accessibility_dataset.jsonl`
+   - Each snippet is tagged with WCAG issues to fix (e.g. `missing_alt_text`, `missing_label_for`)
+2. **LLM Interaction:**
+   - Prompt the model (e.g. GPT-3.5-turbo) to output only the corrected HTML
+3. **Scoring (Rule-Based):**
+   - Define `AccessibilityRule` classes (e.g. `MissingAltTextRule`, `LabelAssociationRule`, `LinkHasTextRule`)
+   - Parse the LLM's output with BeautifulSoup
+   - Check each issue in `issues_to_fix` against the corresponding rule
+   - Assign a score:
+     - **+1.0** All targeted issues fixed correctly
+     - **0.0–0.8** Some but not all issues fixed
+     - **–0.5** Parseable HTML, but none of the targeted issues fixed
+     - **–1.0** Unparseable HTML or regressions on targeted issues
+4. **Output:**
+   - Rollouts compatible with Atropos (tokenized prompts/responses, masks, scores) for RL training
+
+### MVP: Targeted WCAG Criteria
+
+1. **Images (`<img>`):** missing or empty `alt` attributes (WCAG 1.1.1)
+2. **Form labels:** improper `