From ab60b30e30927486cae0c125c7f6917668da8c26 Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Thu, 6 Nov 2025 09:09:08 +0800 Subject: [PATCH 1/4] Add IMO-Bench evaluation scripts for answers and proofs Introduces two new scripts: eval_imobench_answer.py for evaluating short-answer mathematical problems from the AnswerBench dataset, and eval_imobench_proof.py for evaluating rigorous proof problems from the ProofBench dataset. Both scripts support model evaluation, result saving, and detailed performance analysis. --- scripts/eval_imobench_answer.py | 407 ++++++++++++++++++++++ scripts/eval_imobench_proof.py | 600 ++++++++++++++++++++++++++++++++ 2 files changed, 1007 insertions(+) create mode 100644 scripts/eval_imobench_answer.py create mode 100644 scripts/eval_imobench_proof.py diff --git a/scripts/eval_imobench_answer.py b/scripts/eval_imobench_answer.py new file mode 100644 index 0000000..f98b27f --- /dev/null +++ b/scripts/eval_imobench_answer.py @@ -0,0 +1,407 @@ +""" +Evaluation script for IMO-Bench AnswerBench dataset (400 problems) +Tests model performance on short-answer mathematical problems across 4 categories +""" + +import argparse +import json +import os +import logging +import time +import re +import pandas as pd +import requests +from typing import List, Dict, Optional +from datetime import datetime +from openai import OpenAI +from tqdm import tqdm + +# Add sys path to import optillm modules +import sys +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from optillm.utils.answer_extraction import extract_answer + +# Configure logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# Dataset URL +ANSWERBENCH_URL = "https://raw.githubusercontent.com/google-deepmind/superhuman/main/imobench/answerbench.csv" + +SYSTEM_PROMPT = '''You are solving IMO-Bench mathematical problems across algebra, combinatorics, geometry, and number theory. + +Key requirements: +1. **Clear reasoning**: Show your work step-by-step +2. **Mathematical rigor**: Justify each step logically +3. 
**Final answer**: Clearly state your final answer in \\boxed{} format + +For different problem types: +- Algebra: Handle functional equations, polynomials, inequalities +- Combinatorics: Use counting techniques, pigeonhole principle, extremal arguments +- Geometry: Apply coordinate systems, trigonometry, or synthetic methods +- Number Theory: Use divisibility, modular arithmetic, prime factorization + +Always conclude with your final answer in \\boxed{your_answer} format.''' + + +def download_answerbench() -> pd.DataFrame: + """ + Download and parse the AnswerBench CSV dataset + """ + logger.info("Downloading AnswerBench dataset...") + try: + response = requests.get(ANSWERBENCH_URL, timeout=30) + response.raise_for_status() + + # Save to temp file and load with pandas + temp_file = "/tmp/answerbench.csv" + with open(temp_file, 'wb') as f: + f.write(response.content) + + df = pd.read_csv(temp_file) + logger.info(f"Loaded {len(df)} problems from AnswerBench") + return df + + except Exception as e: + logger.error(f"Error downloading AnswerBench: {e}") + raise + + +def normalize_answer(answer: str) -> str: + """ + Normalize answer for comparison + """ + if answer is None: + return "" + + # Convert to string and lowercase + answer = str(answer).strip().lower() + + # Remove extra whitespace + answer = re.sub(r'\s+', ' ', answer) + + # Remove common LaTeX formatting + answer = answer.replace('\\', '') + answer = answer.replace('$', '') + answer = answer.replace('{', '').replace('}', '') + + return answer + + +def compare_answers(predicted: str, ground_truth: str) -> bool: + """ + Compare predicted answer with ground truth + Uses both exact match and semantic equivalence + """ + if not predicted or not ground_truth: + return False + + # Normalize both answers + pred_norm = normalize_answer(predicted) + truth_norm = normalize_answer(ground_truth) + + # Exact match after normalization + if pred_norm == truth_norm: + return True + + # Check if one contains the other (for cases like "4" in "c = 4") + if pred_norm in truth_norm or truth_norm in pred_norm: + return True + + # Try numeric comparison if possible + try: + pred_num = float(re.sub(r'[^0-9.-]', '', predicted)) + truth_num = float(re.sub(r'[^0-9.-]', '', ground_truth)) + if abs(pred_num - truth_num) < 1e-6: + return True + except (ValueError, TypeError): + pass + + return False + + +def extract_answer_from_solution(solution: str, problem_id: str = None) -> str: + """ + Extract the final answer from a solution + """ + if not solution: + return None + + # Try unified answer extraction first + try: + extracted = extract_answer(solution, problem_type="math") + if extracted: + return str(extracted) + except Exception as e: + logger.debug(f"Unified extraction failed: {e}") + + # Look for boxed answers + boxed_pattern = r'\\boxed\{([^}]+)\}' + boxed_matches = re.findall(boxed_pattern, solution) + if boxed_matches: + return boxed_matches[-1].strip() + + # Look for "final answer" or "answer:" sections + answer_patterns = [ + r'final answer[:\s]*([^\n]+)', + r'answer[:\s]*([^\n]+)', + r'therefore[:\s]*([^\n]+)', + r'thus[:\s]*([^\n]+)' + ] + + solution_lower = solution.lower() + for pattern in answer_patterns: + matches = re.findall(pattern, solution_lower) + if matches: + return matches[-1].strip() + + return None + + +def get_llm_response(problem: str, model: str, client: OpenAI, extra_body: dict = None, timeout: int = 300) -> Dict: + """ + Get response from the LLM for a mathematical problem + """ + try: + kwargs = {} + if extra_body: + 
kwargs["extra_body"] = extra_body + + response = client.with_options(timeout=timeout).chat.completions.create( + model=model, + messages=[ + {"role": "system", "content": SYSTEM_PROMPT}, + {"role": "user", "content": problem} + ], + max_tokens=16000, + temperature=0.1, + **kwargs + ) + + solution_text = response.choices[0].message.content.strip() + reasoning_tokens = getattr(response.usage, 'reasoning_tokens', 0) + total_tokens = response.usage.total_tokens if hasattr(response.usage, 'total_tokens') else 0 + + return { + "solution": solution_text, + "reasoning_tokens": reasoning_tokens, + "total_tokens": total_tokens, + "success": True + } + + except Exception as e: + logger.error(f"Error getting LLM response: {e}") + return { + "solution": f"Error: {str(e)}", + "reasoning_tokens": 0, + "total_tokens": 0, + "success": False + } + + +def save_result(filename: str, result: Dict): + """Save a single result to the results file with incremental updates""" + results = [] + if os.path.exists(filename): + try: + with open(filename, 'r') as f: + results = json.load(f) + except (FileNotFoundError, json.JSONDecodeError): + results = [] + + results.append(result) + + with open(filename, 'w') as f: + json.dump(results, f, indent=2) + + +def load_existing_results(filename: str) -> List[Dict]: + """Load existing results from file if it exists""" + try: + with open(filename, 'r') as f: + return json.load(f) + except FileNotFoundError: + return [] + + +def analyze_results(results: List[Dict]): + """Analyze and print comprehensive statistics""" + if not results: + print("No results to analyze") + return + + total_problems = len(results) + correct = sum(1 for r in results if r.get('is_correct', False)) + + print("\n" + "="*80) + print("IMO-Bench AnswerBench Evaluation Results") + print("="*80) + print(f"Total problems: {total_problems}") + print(f"Correct: {correct}") + print(f"Accuracy: {correct/total_problems*100:.2f}%") + + # Category breakdown + categories = {} + for r in results: + cat = r.get('category', 'Unknown') + if cat not in categories: + categories[cat] = {'total': 0, 'correct': 0} + categories[cat]['total'] += 1 + if r.get('is_correct', False): + categories[cat]['correct'] += 1 + + print("\nPerformance by Category:") + print("-" * 60) + for cat, stats in sorted(categories.items()): + acc = stats['correct'] / stats['total'] * 100 if stats['total'] > 0 else 0 + print(f"{cat:20s}: {stats['correct']:3d}/{stats['total']:3d} ({acc:5.1f}%)") + + # Difficulty breakdown if available + difficulties = {} + for r in results: + diff = r.get('difficulty', 'Unknown') + if diff and diff != 'Unknown': + if diff not in difficulties: + difficulties[diff] = {'total': 0, 'correct': 0} + difficulties[diff]['total'] += 1 + if r.get('is_correct', False): + difficulties[diff]['correct'] += 1 + + if difficulties: + print("\nPerformance by Difficulty:") + print("-" * 60) + for diff, stats in sorted(difficulties.items()): + acc = stats['correct'] / stats['total'] * 100 if stats['total'] > 0 else 0 + print(f"{diff:20s}: {stats['correct']:3d}/{stats['total']:3d} ({acc:5.1f}%)") + + # Token statistics + total_tokens = sum(r['response'].get('total_tokens', 0) for r in results) + reasoning_tokens = sum(r['response'].get('reasoning_tokens', 0) for r in results) + + print("\nToken Statistics:") + print("-" * 60) + print(f"Total tokens: {total_tokens:,}") + print(f"Reasoning tokens: {reasoning_tokens:,}") + print(f"Avg tokens per problem: {total_tokens/total_problems:.0f}") + + # Time statistics + total_time = 
sum(r.get('solve_time_seconds', 0) for r in results) + print(f"\nTotal solve time: {total_time:.1f}s ({total_time/60:.1f} minutes)") + print(f"Avg time per problem: {total_time/total_problems:.1f}s") + + print("="*80 + "\n") + + +def main(): + parser = argparse.ArgumentParser(description="Evaluate on IMO-Bench AnswerBench") + parser.add_argument("--model", type=str, required=True, + help="Model to use (e.g., google/gemini-2.5-flash-preview-09-2025)") + parser.add_argument("--base-url", type=str, default="http://localhost:8001/v1", + help="Base URL for OptiLLM server") + parser.add_argument("--timeout", type=int, default=300, + help="Timeout in seconds for each problem") + parser.add_argument("--limit", type=int, default=None, + help="Limit number of problems to evaluate (for testing)") + parser.add_argument("--categories", type=str, default=None, + help="Comma-separated list of categories to evaluate (e.g., 'Algebra,Geometry')") + + args = parser.parse_args() + + # Initialize OpenAI client + client = OpenAI(api_key="optillm", base_url=args.base_url) + + # Setup results directory + os.makedirs("results", exist_ok=True) + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + + # Determine if using MARS approach + is_mars = args.model.startswith("mars-") + approach_name = "mars" if is_mars else "baseline" + model_name = args.model.replace("mars-", "") if is_mars else args.model + + results_file = f"results/imobench_answer_{approach_name}_{model_name.replace('/', '_')}_{timestamp}.json" + + # Download dataset + df = download_answerbench() + + # Filter by categories if specified + if args.categories: + selected_cats = [c.strip() for c in args.categories.split(',')] + df = df[df['Category'].isin(selected_cats)] + print(f"Filtered to categories: {selected_cats}") + + # Limit problems if specified + if args.limit: + df = df.head(args.limit) + + print(f"\nEvaluating {len(df)} AnswerBench problems") + print(f"Model: {args.model}") + print(f"Approach: {approach_name}") + print(f"Results will be saved to: {results_file}\n") + + # Evaluate each problem + for idx, row in tqdm(df.iterrows(), total=len(df), desc="Solving problems"): + problem_id = row.get('Problem ID', f'problem_{idx}') + problem_text = row['Problem'] + ground_truth = row['Short Answer'] + category = row.get('Category', 'Unknown') + subcategory = row.get('Subcategory', '') + difficulty = row.get('Difficulty', '') + + logger.info(f"Evaluating {problem_id}: {category}") + + start_time = time.time() + + # Get LLM response + response = get_llm_response( + problem_text, + args.model, + client, + extra_body=None, # Model prefix handles MARS + timeout=args.timeout + ) + + solve_time = time.time() - start_time + + # Extract answer + extracted_answer = extract_answer_from_solution(response['solution'], problem_id) + + # Compare with ground truth + is_correct = compare_answers(extracted_answer, ground_truth) + + # Compile result + result = { + "timestamp": datetime.now().isoformat(), + "model": args.model, + "approach": approach_name, + "problem_id": problem_id, + "category": category, + "subcategory": subcategory, + "difficulty": difficulty, + "problem": problem_text, + "ground_truth": ground_truth, + "extracted_answer": extracted_answer, + "is_correct": is_correct, + "response": response, + "solve_time_seconds": solve_time + } + + # Save result immediately + save_result(results_file, result) + + status = "✓" if is_correct else "✗" + logger.info(f"{status} {problem_id} - Answer: {extracted_answer}") + + # Load and analyze all results + print("\n" 
+ "="*80) + print("Evaluation Complete!") + print("="*80) + + results = load_existing_results(results_file) + analyze_results(results) + + print(f"Results saved to: {results_file}") + + +if __name__ == "__main__": + main() diff --git a/scripts/eval_imobench_proof.py b/scripts/eval_imobench_proof.py new file mode 100644 index 0000000..7a7ae2b --- /dev/null +++ b/scripts/eval_imobench_proof.py @@ -0,0 +1,600 @@ +""" +Evaluation script for IMO-Bench ProofBench dataset (60 problems) +Tests model performance on rigorous mathematical proof construction +Uses IMO25-style verification system for grading +""" + +import argparse +import json +import os +import logging +import time +import re +import pandas as pd +import requests +from typing import List, Dict, Optional +from datetime import datetime +from openai import OpenAI +from tqdm import tqdm + +# Add sys path to import optillm modules +import sys +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +# Configure logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +# Dataset URL +PROOFBENCH_URL = "https://raw.githubusercontent.com/google-deepmind/superhuman/main/imobench/proofbench.csv" + +SYSTEM_PROMPT = '''You are solving IMO-level mathematical proof problems. These require rigorous, complete proofs. + +Key requirements: +1. **Complete proofs**: Provide rigorous, step-by-step mathematical proofs +2. **Mathematical rigor**: Every step must be logically justified +3. **Clear structure**: Organize your solution with clear logical flow +4. **Proper notation**: Use correct mathematical notation and formatting +5. **Verification**: Double-check your reasoning and conclusions + +For different problem types: +- Functional equations: Consider injectivity, surjectivity, and special values +- Geometry: Use coordinate systems, trigonometry, or synthetic methods as appropriate +- Number theory: Apply divisibility, modular arithmetic, and prime factorization +- Combinatorics: Use counting techniques, pigeonhole principle, and extremal arguments +- Algebra: Handle polynomials, inequalities, and algebraic structures + +Provide a complete, rigorous proof that addresses all aspects of the problem.''' + +VERIFICATION_PROMPT = """You are an expert mathematician and IMO grader. Your task is to rigorously verify this mathematical solution. + +**Grading Scale (0-7 points):** +- **7 points**: Complete, rigorous, correct proof +- **6 points**: Correct approach, minor gaps or notation issues +- **5 points**: Mostly correct, some gaps in rigor +- **3-4 points**: Significant progress, partial solution +- **1-2 points**: Some correct ideas, incomplete +- **0 points**: No progress or completely wrong + +**Verification Instructions:** +1. Check logical correctness of each step +2. Verify mathematical rigor and completeness +3. Identify any critical errors or gaps +4. 
Assess proof structure and clarity + +**Problem:** +{problem} + +**Solution to verify:** +{solution} + +Provide your assessment in the following format: + +**SCORE:** [0-7] +**VERDICT:** [Correct/Partially Correct/Incorrect] +**REASONING:** [Detailed explanation of your assessment] +**CRITICAL ERRORS:** [List any critical errors found, or "None"] +**GAPS:** [List any gaps in rigor, or "None"]""" + + +def download_proofbench() -> pd.DataFrame: + """ + Download and parse the ProofBench CSV dataset + """ + logger.info("Downloading ProofBench dataset...") + try: + response = requests.get(PROOFBENCH_URL, timeout=30) + response.raise_for_status() + + # Save to temp file and load with pandas + temp_file = "/tmp/proofbench.csv" + with open(temp_file, 'wb') as f: + f.write(response.content) + + df = pd.read_csv(temp_file) + logger.info(f"Loaded {len(df)} problems from ProofBench") + return df + + except Exception as e: + logger.error(f"Error downloading ProofBench: {e}") + raise + + +def verify_proof(problem: str, solution: str, grading_guidelines: str, model: str, client: OpenAI) -> Dict: + """ + Verify a proof using IMO25-style two-stage verification + Returns score on 0-7 scale and detailed assessment + """ + try: + # Format verification prompt + verification_text = VERIFICATION_PROMPT.format( + problem=problem, + solution=solution + ) + + # Add grading guidelines if available + if grading_guidelines and pd.notna(grading_guidelines): + verification_text += f"\n\n**Grading Guidelines:**\n{grading_guidelines}" + + # Get verification response + response = client.chat.completions.create( + model=model, + messages=[ + {"role": "system", "content": "You are an expert IMO grader. Provide rigorous assessment."}, + {"role": "user", "content": verification_text} + ], + max_tokens=4000, + temperature=0.1 + ) + + verification_response = response.choices[0].message.content.strip() + + # Extract score + score_match = re.search(r'\*\*SCORE:\*\*\s*(\d+)', verification_response) + score = int(score_match.group(1)) if score_match else 0 + + # Extract verdict + verdict_match = re.search(r'\*\*VERDICT:\*\*\s*([^\n]+)', verification_response) + verdict = verdict_match.group(1).strip() if verdict_match else "Unknown" + + # Determine if correct (7 points = full marks) + is_correct = (score == 7) + + # Check for critical errors + errors_match = re.search(r'\*\*CRITICAL ERRORS:\*\*\s*([^\n]+)', verification_response) + has_critical_errors = errors_match and "None" not in errors_match.group(1) if errors_match else False + + return { + "score": score, + "verdict": verdict, + "is_correct": is_correct, + "has_critical_errors": has_critical_errors, + "verification_response": verification_response, + "success": True + } + + except Exception as e: + logger.error(f"Error in proof verification: {e}") + return { + "score": 0, + "verdict": "Error", + "is_correct": False, + "has_critical_errors": True, + "verification_response": f"Verification error: {str(e)}", + "success": False + } + + +def extract_solution_quality(solution: str) -> Dict: + """ + Analyze the quality of a mathematical proof + """ + analysis = { + "has_proof_structure": False, + "uses_mathematical_notation": False, + "has_logical_steps": False, + "addresses_cases": False, + "has_conclusion": False, + "length_score": 0 + } + + if not solution: + return analysis + + solution_lower = solution.lower() + + # Check for proof structure + proof_keywords = ["proof:", "solution:", "we prove", "to show", "suppose", "assume", "let", "consider"] + if any(keyword in 
solution_lower for keyword in proof_keywords): + analysis["has_proof_structure"] = True + + # Check for mathematical notation + math_patterns = [r'\$.*\$', r'\\[a-zA-Z]+', r'\\geq', r'\\leq', r'\\in', r'\\sum', r'\\prod'] + if any(re.search(pattern, solution) for pattern in math_patterns): + analysis["uses_mathematical_notation"] = True + + # Check for logical flow + logical_words = ["therefore", "thus", "hence", "consequently", "since", "because", "implies"] + logical_count = sum(1 for word in logical_words if word in solution_lower) + if logical_count >= 3: + analysis["has_logical_steps"] = True + + # Check for case analysis + case_words = ["case", "if", "when", "suppose"] + case_count = sum(1 for word in case_words if word in solution_lower) + if case_count >= 2: + analysis["addresses_cases"] = True + + # Check for conclusion + conclusion_words = ["therefore", "thus", "q.e.d", "qed", "proven", "concluded"] + if any(word in solution_lower for word in conclusion_words): + analysis["has_conclusion"] = True + + # Length score (normalized) + analysis["length_score"] = min(len(solution) / 2000, 1.0) + + return analysis + + +def get_llm_response(problem: str, model: str, client: OpenAI, extra_body: dict = None, timeout: int = 600) -> Dict: + """ + Get response from the LLM for a proof problem + """ + try: + kwargs = {} + if extra_body: + kwargs["extra_body"] = extra_body + + response = client.with_options(timeout=timeout).chat.completions.create( + model=model, + messages=[ + {"role": "system", "content": SYSTEM_PROMPT}, + {"role": "user", "content": problem} + ], + max_tokens=64000, # Extended for complex proofs + temperature=0.1, + **kwargs + ) + + solution_text = response.choices[0].message.content.strip() + reasoning_tokens = getattr(response.usage, 'reasoning_tokens', 0) + total_tokens = response.usage.total_tokens if hasattr(response.usage, 'total_tokens') else 0 + + return { + "solution": solution_text, + "reasoning_tokens": reasoning_tokens, + "total_tokens": total_tokens, + "success": True + } + + except Exception as e: + logger.error(f"Error getting LLM response: {e}") + return { + "solution": f"Error: {str(e)}", + "reasoning_tokens": 0, + "total_tokens": 0, + "success": False + } + + +def save_result(filename: str, result: Dict): + """Save a single result with incremental updates""" + results = [] + if os.path.exists(filename): + try: + with open(filename, 'r') as f: + results = json.load(f) + except (FileNotFoundError, json.JSONDecodeError): + results = [] + + results.append(result) + + with open(filename, 'w') as f: + json.dump(results, f, indent=2) + + +def load_existing_results(filename: str) -> List[Dict]: + """Load existing results from file""" + try: + with open(filename, 'r') as f: + return json.load(f) + except FileNotFoundError: + return [] + + +def calculate_subset_scores(results: List[Dict]) -> Dict: + """ + Calculate full credit scores for various subsets (Novel, IMO 2024, USAMO 2025) + Returns dictionary with subset names and their (solved, total, percentage) tuples + """ + subsets = { + 'Novel': {'full': 0, 'total': 0}, + 'IMO 2024': {'full': 0, 'total': 0}, + 'USAMO 2025': {'full': 0, 'total': 0} + } + + for r in results: + source = r.get('source', '') + score = r['verification'].get('score', 0) + is_full = (score == 7) + + # Categorize by source + if 'Novel Problem' in source: + subsets['Novel']['total'] += 1 + if is_full: + subsets['Novel']['full'] += 1 + elif 'IMO 2024' in source: + subsets['IMO 2024']['total'] += 1 + if is_full: + subsets['IMO 2024']['full'] += 1 
+ elif 'USAMO 2025' in source: + subsets['USAMO 2025']['total'] += 1 + if is_full: + subsets['USAMO 2025']['full'] += 1 + + # Calculate percentages + subset_stats = {} + for name, counts in subsets.items(): + total = counts['total'] + full = counts['full'] + pct = (full / total * 100) if total > 0 else 0 + subset_stats[name] = (full, total, pct) + + return subset_stats + + +def analyze_results(results: List[Dict]): + """Analyze and print comprehensive statistics with full credit prioritized""" + if not results: + print("No results to analyze") + return + + total_problems = len(results) + full_marks = sum(1 for r in results if r['verification'].get('score', 0) == 7) + partial_credit = sum(1 for r in results if 1 <= r['verification'].get('score', 0) <= 6) + no_credit = total_problems - full_marks - partial_credit + + total_score = sum(r['verification'].get('score', 0) for r in results) + avg_score = total_score / total_problems + + print("\n" + "="*80) + print("IMO-Bench ProofBench Evaluation Results") + print("="*80) + + # ======================================================================== + # SECTION 1: FULL CREDIT SCORES (PRIMARY METRIC) + # ======================================================================== + print("\n" + "="*80) + print("FULL CREDIT SCORES (7/7 = Solved) - PRIMARY METRIC") + print("="*80) + print(f"\nOverall: {full_marks}/{total_problems} = {full_marks/total_problems*100:.1f}%") + + # Basic vs Advanced breakdown (full credit only) + basic_full = sum(1 for r in results if 'Basic' in r.get('problem_id', '') and r['verification'].get('score', 0) == 7) + basic_total = sum(1 for r in results if 'Basic' in r.get('problem_id', '')) + adv_full = sum(1 for r in results if 'Advanced' in r.get('problem_id', '') and r['verification'].get('score', 0) == 7) + adv_total = sum(1 for r in results if 'Advanced' in r.get('problem_id', '')) + + print(f"\nBasic problems: {basic_full}/{basic_total} = {basic_full/basic_total*100 if basic_total > 0 else 0:.1f}%") + print(f"Advanced problems: {adv_full}/{adv_total} = {adv_full/adv_total*100 if adv_total > 0 else 0:.1f}%") + + # ======================================================================== + # SECTION 2: SUBSET BREAKDOWN (Novel, IMO 2024, USAMO 2025) + # ======================================================================== + subset_stats = calculate_subset_scores(results) + + if any(total > 0 for _, total, _ in subset_stats.values()): + print("\n" + "-"*80) + print("Subset Breakdown (Full Credit Only):") + print("-"*80) + for name in ['Novel', 'IMO 2024', 'USAMO 2025']: + full, total, pct = subset_stats[name] + if total > 0: + print(f"{name:15s}: {full}/{total} = {pct:.1f}%") + + # ======================================================================== + # SECTION 3: DETAILED ANALYSIS (Average Scores and Distributions) + # ======================================================================== + print("\n" + "="*80) + print("DETAILED ANALYSIS (Average Scores)") + print("="*80) + print(f"\nAverage score: {avg_score:.2f}/7 ({avg_score/7*100:.1f}%)") + print(f"Full credit (7/7): {full_marks} ({full_marks/total_problems*100:.1f}%)") + print(f"Partial credit (1-6): {partial_credit} ({partial_credit/total_problems*100:.1f}%)") + print(f"No credit (0): {no_credit} ({no_credit/total_problems*100:.1f}%)") + + # Basic vs Advanced (average scores) + basic_scores = [r['verification'].get('score', 0) for r in results if 'Basic' in r.get('problem_id', '')] + adv_scores = [r['verification'].get('score', 0) for r in results if 
'Advanced' in r.get('problem_id', '')] + + if basic_scores or adv_scores: + print("\n" + "-"*80) + print("Basic vs Advanced (Average Scores):") + print("-"*80) + if basic_scores: + basic_avg = sum(basic_scores) / len(basic_scores) + print(f"Basic ({len(basic_scores)}): {basic_avg:.2f}/7 ({basic_avg/7*100:.1f}%)") + if adv_scores: + adv_avg = sum(adv_scores) / len(adv_scores) + print(f"Advanced ({len(adv_scores)}): {adv_avg:.2f}/7 ({adv_avg/7*100:.1f}%)") + + # Category breakdown (average scores) + categories = {} + for r in results: + cat = r.get('category', 'Unknown') + if cat not in categories: + categories[cat] = {'total': 0, 'scores': [], 'full': 0} + categories[cat]['total'] += 1 + score = r['verification'].get('score', 0) + categories[cat]['scores'].append(score) + if score == 7: + categories[cat]['full'] += 1 + + if categories: + print("\n" + "-"*80) + print("Performance by Category:") + print("-"*80) + for cat, stats in sorted(categories.items()): + avg = sum(stats['scores']) / len(stats['scores']) if stats['scores'] else 0 + full = stats['full'] + total = stats['total'] + print(f"{cat:20s}: Avg {avg:.2f}/7 ({avg/7*100:.1f}%) | Solved {full}/{total} ({full/total*100:.1f}%)") + + # Level breakdown (average scores) + levels = {} + for r in results: + level = r.get('level', 'Unknown') + if level not in levels: + levels[level] = {'total': 0, 'scores': [], 'full': 0} + levels[level]['total'] += 1 + score = r['verification'].get('score', 0) + levels[level]['scores'].append(score) + if score == 7: + levels[level]['full'] += 1 + + if levels: + print("\n" + "-"*80) + print("Performance by Level:") + print("-"*80) + for level, stats in sorted(levels.items()): + avg = sum(stats['scores']) / len(stats['scores']) if stats['scores'] else 0 + full = stats['full'] + total = stats['total'] + print(f"{level:20s}: Avg {avg:.2f}/7 ({avg/7*100:.1f}%) | Solved {full}/{total} ({full/total*100:.1f}%)") + + # Token statistics + try: + total_tokens = sum(r['response'].get('total_tokens', 0) for r in results) + reasoning_tokens = sum(r['response'].get('reasoning_tokens', 0) for r in results) + + print("\n" + "-"*80) + print("Token Statistics:") + print("-"*80) + print(f"Total tokens: {total_tokens:,}") + print(f"Reasoning tokens: {reasoning_tokens:,}") + print(f"Avg tokens per problem: {total_tokens/total_problems:.0f}") + except (KeyError, TypeError): + pass # Skip token stats if data not available + + # Time statistics + total_time = sum(r.get('solve_time_seconds', 0) for r in results) + print("\n" + "-"*80) + print(f"Total solve time: {total_time:.1f}s ({total_time/60:.1f} minutes, {total_time/3600:.1f} hours)") + print(f"Avg time per problem: {total_time/total_problems:.1f}s") + + print("="*80 + "\n") + + +def main(): + parser = argparse.ArgumentParser(description="Evaluate on IMO-Bench ProofBench") + parser.add_argument("--model", type=str, required=True, + help="Model to use (e.g., google/gemini-2.5-flash-preview-09-2025 or mars-...)") + parser.add_argument("--base-url", type=str, default="http://localhost:8001/v1", + help="Base URL for OptiLLM server") + parser.add_argument("--verifier-model", type=str, default=None, + help="Model to use for verification (defaults to same as solver)") + parser.add_argument("--timeout", type=int, default=600, + help="Timeout in seconds for each problem") + parser.add_argument("--limit", type=int, default=None, + help="Limit number of problems (for testing)") + parser.add_argument("--subset", type=str, default=None, + help="Evaluate only 'basic' or 'advanced' subset") + 
+ args = parser.parse_args() + + # Initialize OpenAI client + client = OpenAI(api_key="optillm", base_url=args.base_url) + + # Verifier model defaults to solver model + verifier_model = args.verifier_model or args.model.replace("mars-", "") + + # Setup results directory + os.makedirs("results", exist_ok=True) + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + + # Determine if using MARS + is_mars = args.model.startswith("mars-") + approach_name = "mars" if is_mars else "baseline" + model_name = args.model.replace("mars-", "") if is_mars else args.model + + results_file = f"results/imobench_proof_{approach_name}_{model_name.replace('/', '_')}_{timestamp}.json" + + # Download dataset + df = download_proofbench() + + # Filter by subset if specified + if args.subset: + if args.subset.lower() == 'basic': + df = df[df['Level'].str.contains('Basic', case=False, na=False)] + elif args.subset.lower() == 'advanced': + df = df[df['Level'].str.contains('Advanced', case=False, na=False)] + print(f"Filtered to {args.subset} subset") + + # Limit problems if specified + if args.limit: + df = df.head(args.limit) + + print(f"\nEvaluating {len(df)} ProofBench problems") + print(f"Model: {args.model}") + print(f"Approach: {approach_name}") + print(f"Verifier: {verifier_model}") + if is_mars: + print("MARS Config: use_thinking_tags=False, answer_extraction_mode='none'") + print(f"Results will be saved to: {results_file}\n") + + # Evaluate each problem + for idx, row in tqdm(df.iterrows(), total=len(df), desc="Solving proof problems"): + problem_id = row.get('Problem ID', f'problem_{idx}') + problem_text = row['Problem'] + reference_solution = row.get('Solution', '') + grading_guidelines = row.get('Grading guidelines', '') + category = row.get('Category', 'Unknown') + level = row.get('Level', 'Unknown') + source = row.get('Source', 'Unknown') + + logger.info(f"Evaluating {problem_id}: {category} ({level})") + + start_time = time.time() + + # Get LLM response (model prefix handles MARS configuration automatically) + response = get_llm_response( + problem_text, + args.model, + client, + extra_body=None, + timeout=args.timeout + ) + + solve_time = time.time() - start_time + + # Verify the proof + verification = verify_proof( + problem_text, + response['solution'], + grading_guidelines, + verifier_model, + client + ) + + # Analyze solution quality + quality = extract_solution_quality(response['solution']) + + # Compile result + result = { + "timestamp": datetime.now().isoformat(), + "model": args.model, + "approach": approach_name, + "verifier_model": verifier_model, + "problem_id": problem_id, + "category": category, + "level": level, + "source": source, + "problem": problem_text, + "reference_solution": reference_solution, + "grading_guidelines": grading_guidelines, + "response": response, + "verification": verification, + "quality": quality, + "solve_time_seconds": solve_time + } + + # Save result immediately + save_result(results_file, result) + + score = verification.get('score', 0) + status = "✓ SOLVED" if score == 7 else f"✗ {score}/7" + logger.info(f"{status} {problem_id}") + + # Load and analyze all results + print("\n" + "="*80) + print("Evaluation Complete!") + print("="*80) + + results = load_existing_results(results_file) + analyze_results(results) + + print(f"Results saved to: {results_file}") + + +if __name__ == "__main__": + main() From 2558d80b89b1e4c7c8f1275fd3194b1c0ac612c7 Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Sat, 8 Nov 2025 08:47:23 +0800 Subject: [PATCH 2/4] Free up disk 
space in Docker publish workflows Added a step to remove unused SDKs and prune Docker system volumes in both amd64 and arm64 Docker publish workflows. This helps prevent disk space issues during CI builds. --- .github/workflows/publish-docker-full-amd64.yml | 10 +++++++++- .github/workflows/publish-docker-full-arm64.yml | 10 +++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/.github/workflows/publish-docker-full-amd64.yml b/.github/workflows/publish-docker-full-amd64.yml index 216fe49..5a81b29 100644 --- a/.github/workflows/publish-docker-full-amd64.yml +++ b/.github/workflows/publish-docker-full-amd64.yml @@ -15,7 +15,15 @@ jobs: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - + + - name: Free up disk space + run: | + sudo rm -rf /usr/share/dotnet + sudo rm -rf /usr/local/lib/android + sudo rm -rf /opt/ghc + sudo rm -rf /opt/hostedtoolcache/CodeQL + docker system prune -af --volumes + - name: Log in to GitHub Container Registry uses: docker/login-action@v3 with: diff --git a/.github/workflows/publish-docker-full-arm64.yml b/.github/workflows/publish-docker-full-arm64.yml index f3f43e2..6fef662 100644 --- a/.github/workflows/publish-docker-full-arm64.yml +++ b/.github/workflows/publish-docker-full-arm64.yml @@ -18,7 +18,15 @@ jobs: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - + + - name: Free up disk space + run: | + sudo rm -rf /usr/share/dotnet + sudo rm -rf /usr/local/lib/android + sudo rm -rf /opt/ghc + sudo rm -rf /opt/hostedtoolcache/CodeQL + docker system prune -af --volumes + - name: Log in to GitHub Container Registry uses: docker/login-action@v3 with: From 909502e67a23dbb1e9db4c3a9a3a3571dad15a8d Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Sat, 8 Nov 2025 08:49:57 +0800 Subject: [PATCH 3/4] Bump version to 0.3.6 Co-Authored-By: Claude --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 5a7fd95..6a909e6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "optillm" -version = "0.3.5" +version = "0.3.6" description = "An optimizing inference proxy for LLMs." readme = "README.md" license = "Apache-2.0" From 5030c52a791b05a26e27bc751ece5f56d0e69cf4 Mon Sep 17 00:00:00 2001 From: Asankhaya Sharma Date: Sat, 8 Nov 2025 08:52:31 +0800 Subject: [PATCH 4/4] Update __init__.py --- optillm/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optillm/__init__.py b/optillm/__init__.py index 50b9850..2ba7e5b 100644 --- a/optillm/__init__.py +++ b/optillm/__init__.py @@ -1,5 +1,5 @@ # Version information -__version__ = "0.3.5" +__version__ = "0.3.6" # Import from server module from .server import (
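
A minimal, hedged sketch of how the answer-matching helpers added in PATCH 1/4 behave. It assumes that patch is applied, that the snippet runs from the repository root with the scripts' dependencies (pandas, requests, openai, tqdm, and optillm itself) installed, and that scripts/ is put on sys.path by hand, since eval_imobench_answer.py ships as a standalone script rather than a package module.

    # Illustrative check of the AnswerBench matching helpers (import path is an assumption).
    import sys
    sys.path.insert(0, "scripts")  # assumption: executed from the repository root

    from eval_imobench_answer import compare_answers, normalize_answer

    # Normalization lowercases and strips LaTeX markup ($, \, {}) and extra whitespace.
    print(normalize_answer(r"$2\pi$"))    # -> "2pi"

    # Containment covers answers embedded in short phrases, e.g. "4" within "c = 4".
    print(compare_answers("c = 4", "4"))  # -> True

    # Genuinely different values fall through the exact, containment, and numeric checks.
    print(compare_answers("5", "7"))      # -> False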