#!/usr/bin/env python3
"""
Benchmark Evaluator for Meta-Harness Evolution

Evaluates a candidate harness against the 20 benchmark scenarios.
Each scenario is scored 0-3: fail / partial / pass / excellent.
Final score is a weighted average across categories.

Usage: python3 evaluate.py <candidate_dir>
"""

import argparse
import json
import os
import sys
import tempfile
import shutil
import re
from pathlib import Path
from datetime import datetime

# Paths
WORKSPACE = Path.home() / "hoss-evolution"
BENCHMARK_DIR = WORKSPACE / "benchmark" / "scenarios"
HOSS_WORKSPACE = Path.home() / ".openclaw" / "workspace"
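
# Scoring model: each scenario contributes (score / 3) * weight; evaluate() sums
# these contributions, normalizes by the total weight, and scales to 0-100.
# Illustrative arithmetic: a weight-0.08 scenario scored 2/3 contributes
# 0.08 * (2 / 3), about 0.053, before normalization; if every scenario scored 2,
# the final score would be 2/3 * 100, about 66.7, regardless of the weights.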

SCENARIOS = [
    # Memory tasks (25%)
    {
        "id": "memory_1",
        "category": "memory",
        "weight": 0.08,
        "name": "Recall from daily log",
        "task": "Given YESTERDAY's date, recall what was logged in memory/YYYY-MM-DD.md and summarize the key decisions made.",
        "expected": "Correct date, accurate recall of logged decisions",
        "rubric": {
            0: "No recall or wrong date",
            1: "Partial recall, missing key details",
            2: "Correct date and main decisions",
            3: "Perfect recall with full context synthesis",
        },
    },
    {
        "id": "memory_2",
        "category": "memory",
        "weight": 0.08,
        "name": "Update MEMORY.md with new fact",
        "task": "Add a new entry to MEMORY.md: 'Hoss ran its first meta-harness evolution on [TODAY DATE]. The loop successfully evaluated a candidate and posted results to Discord.' Include proper formatting and timestamp.",
        "expected": "New entry appended to MEMORY.md with today's date and proper structure",
        "rubric": {
            0: "Did not update MEMORY.md",
            1: "Updated but wrong date or poor formatting",
            2: "Correct update, minor formatting issue",
            3: "Perfect update with proper timestamp and structure",
        },
    },
    {
        "id": "memory_3",
        "category": "memory",
        "weight": 0.09,
        "name": "Synthesize across memory files",
        "task": "Search memory/ for all entries about 'sub-agent' or 'spawn'. Synthesize a coherent summary of what Flume's sub-agent architecture looks like based on those entries.",
        "expected": "Correct use of memory_search, accurate synthesis",
        "rubric": {
            0: "Did not use memory_search or synthesized incorrectly",
            1: "Found some entries but synthesis is disjointed",
            2: "Good synthesis with minor gaps",
            3: "Excellent synthesis with cross-reference insights",
        },
    },
    # Code tasks (25%)
    {
        "id": "code_1",
        "category": "code",
        "weight": 0.10,
        "name": "Write a working Python script",
        "task": "Write a Python script that: 1) reads ~/hoss-evolution/evolution_log.jsonl, 2) finds the candidate with the highest final_score, 3) prints the candidate name and score. Make it executable and working.",
        "expected": "Script runs without error, correctly identifies best candidate",
        "rubric": {
            0: "Script doesn't run or crashes",
            1: "Script runs but wrong answer",
            2: "Script works but poorly formatted",
            3: "Clean, working, well-commented script",
        },
    },
    {
        "id": "code_2",
        "category": "code",
        "weight": 0.08,
        "name": "Debug a broken script",
        "task": "The following script has a bug: it tries to read a JSON file but fails when the file is empty. Find and fix the bug: ```python\nimport json\nwith open('empty.json') as f:\n    data = json.load(f)\nprint(data)\n```",
        "expected": "Identifies the json.JSONDecodeError (a ValueError) raised by json.load on an empty file; catches it or checks that the file is non-empty before parsing",
        "rubric": {
            0: "Did not identify the bug",
            1: "Identified but wrong fix",
            2: "Correct fix but no error handling",
            3: "Correct fix with proper error handling",
        },
    },
    {
        "id": "code_3",
        "category": "code",
        "weight": 0.07,
        "name": "Review code for security issue",
        "task": "Review this code for security issues: a Python script that takes a user-provided path and reads it: ```python\nimport os\npath = input('Enter path: ')\nwith open(path) as f:\n    print(f.read())\n```",
        "expected": "Identifies path traversal vulnerability, suggests validation",
        "rubric": {
            0: "Did not identify the vulnerability",
            1: "Identified but no fix suggested",
            2: "Correct ID + partial fix",
            3: "Full secure solution with os.path.abspath + exists check",
        },
    },
    # Coordination tasks (15%)
    {
        "id": "coord_1",
        "category": "coordination",
        "weight": 0.08,
        "name": "Spawn parallel sub-agents",
        "task": "You need to research 3 unrelated topics simultaneously: 1) the latest OpenAI model releases, 2) Cloudflare Pages pricing, 3) a Rust async runtime comparison. Spawn 3 sub-agents in parallel to do this research, then synthesize the results.",
        "expected": "3 sub-agents spawned with appropriate tasks, results synthesized",
        "rubric": {
            0: "Did not spawn agents or no synthesis",
            1: "Spawned agents but results not synthesized",
            2: "Good parallel execution, minor synthesis gaps",
            3: "Excellent parallelization with coherent synthesis",
        },
    },
    {
        "id": "coord_2",
        "category": "coordination",
        "weight": 0.07,
        "name": "Delegate to correct sub-agent",
        "task": "Tyler asks: 'Can you check if our GitHub repos have any open PRs that need review?' Which sub-agent should handle this? Describe the delegation and what the agent should do.",
        "expected": "Correctly identifies sales/marketer/scouts role, describes delegation",
        "rubric": {
            0: "Wrong agent or no delegation described",
            1: "Correct agent but incomplete delegation description",
            2: "Correct delegation with task spec",
            3: "Correct delegation + coordination protocol followed",
        },
    },
    # Research tasks (20%)
    {
        "id": "research_1",
        "category": "research",
        "weight": 0.10,
        "name": "Web search and synthesize",
        "task": "Search the web for 'OpenClaw vs Cursor AI agent comparison 2026'. Synthesize findings into a 3-point comparison table with pros/cons for each.",
        "expected": "Search returned relevant results, synthesis is accurate",
        "rubric": {
            0: "No search or irrelevant results",
            1: "Some relevant results but poor synthesis",
            2: "Good comparison with minor inaccuracies",
            3: "Excellent synthesis with nuanced comparison",
        },
    },
    {
        "id": "research_2",
        "category": "research",
        "weight": 0.10,
        "name": "Fetch and summarize paper",
        "task": "Fetch the content from https://example.com (or any URL that returns text). Summarize it in 3 sentences.",
        "expected": "Correct fetch, coherent 3-sentence summary",
        "rubric": {
            0: "Did not fetch or fetch failed",
            1: "Fetched but summary is incoherent or wrong",
            2: "Good summary, minor details off",
            3: "Perfect fetch and concise summary",
        },
    },
    # Communication tasks (10%)
    {
        "id": "comm_1",
        "category": "communication",
        "weight": 0.05,
        "name": "Draft Discord message",
        "task": "Draft a message for the #research Discord channel summarizing that a Meta-Harness evolution iteration completed. Include: candidate number, score, what changed vs prior, and one key insight.",
        "expected": "Professional, concise, appropriate for #research channel",
        "rubric": {
            0: "Too casual or missing required info",
            1: "Has required info but poor structure",
            2: "Good message, minor tone issues",
            3: "Excellent message with good structure and insight",
        },
    },
    {
        "id": "comm_2",
        "category": "communication",
        "weight": 0.05,
        "name": "Write email response",
        "task": "Draft a response to a frustrated customer who received a broken product. Tone: apologetic but not groveling. Length: 4-5 sentences. Offer a concrete resolution.",
        "expected": "Professional, empathetic, offers concrete resolution",
        "rubric": {
            0: "Wrong tone or no resolution",
            1: "Right tone but vague or no resolution",
            2: "Good response, minor tweaks needed",
            3: "Excellent response with clear resolution and empathy",
        },
    },
    # Quality tasks (5%)
    {
        "id": "quality_1",
        "category": "quality",
        "weight": 0.03,
        "name": "Spot broken links",
        "task": "Review the following URLs for potential rot: 1) https://github.com/tylerdotai/agent-hosting, 2) https://flumeusa.com/agent-hosting. Are these likely to still work? Why or why not?",
        "expected": "Checks URL patterns, identifies likely status",
        "rubric": {
            0: "Did not check or random guess",
            1: "Partially correct but no reasoning",
            2: "Correct assessment with basic reasoning",
            3: "Excellent reasoning about URL patterns and repo health",
        },
    },
    {
        "id": "quality_2",
        "category": "quality",
        "weight": 0.02,
        "name": "Catch inconsistency",
        "task": "In MEMORY.md, the Flume Focus Decision says 'client-portal: API route mismatch, Vercel project needs manual deletion', but the current AGENTS.md still lists client-portal as an active product. Catch this inconsistency.",
        "expected": "Identifies the contradiction between the two files",
        "rubric": {
            0: "Did not notice the inconsistency",
            1: "Noticed but wrong interpretation",
            2: "Correctly identified but no recommendation",
            3: "Identified + recommended resolution",
        },
    },
    # Additional diverse tasks to reach 20
    {
        "id": "memory_4",
        "category": "memory",
        "weight": 0.05,
        "name": "Memory file creation",
        "task": "Create a new daily memory file at memory/YYYY-MM-DD.md (use today's date) with sections: ## What Happened, ## Decisions Made, ## Blockers, ## Tomorrow. Leave sections blank as templates.",
        "expected": "Correct date in filename, all 4 sections present, proper markdown",
        "rubric": {
            0: "Wrong date or missing sections",
            1: "Correct date but incomplete sections",
            2: "All sections present, minor formatting",
            3: "Perfect template with helpful formatting",
        },
    },
    {
        "id": "code_4",
        "category": "code",
        "weight": 0.05,
        "name": "Write a bash one-liner",
        "task": "Write a bash one-liner that finds all .md files in ~/.openclaw/workspace/ that contain both 'MEMORY' and 'evolution', sorted by modification time.",
        "expected": "Correct find + grep pipeline, sorted by mtime",
        "rubric": {
            0: "Command doesn't run or wrong logic",
            1: "Partially correct but missing sort or wrong grep",
            2: "Working command with minor inefficiency",
            3: "Perfect efficient one-liner",
        },
    },
    {
        "id": "coord_3",
        "category": "coordination",
        "weight": 0.05,
        "name": "Handle agent failure",
        "task": "A sub-agent you spawned failed with 'connection timeout'. Tyler messages you about it 10 minutes later. What do you do? Describe your response and actions.",
        "expected": "Acknowledges failure, explains what happened, proposes retry or alternative",
        "rubric": {
            0: "Ignored or blamed Tyler",
            1: "Acknowledged but no action plan",
            2: "Good response with retry plan",
            3: "Excellent response with diagnosis + fix + prevention",
        },
    },
    {
        "id": "research_3",
        "category": "research",
        "weight": 0.05,
        "name": "Competitive analysis",
        "task": "Do a quick competitive analysis: What are the top 3 AI agent hosting platforms in 2026? For each: name, key pricing tier, and one differentiating feature. Table format.",
        "expected": "3 real platforms, accurate pricing/features, table format",
        "rubric": {
            0: "No analysis or wrong platforms",
            1: "Correct platforms but inaccurate info",
            2: "Good analysis, minor details off",
            3: "Excellent analysis with nuanced differentiation",
        },
    },
    {
        "id": "comm_3",
        "category": "communication",
        "weight": 0.05,
        "name": "Handle disagreeable Tyler",
        "task": "Tyler pushes back on a technical recommendation you made, saying 'I think you're wrong'. Respond in character as Hoss — have an opinion, defend it briefly, then defer if he insists.",
        "expected": "Defends position, doesn't grovel, defers gracefully",
        "rubric": {
            0: "Agrees immediately or gets defensive",
            1: "Defends but poorly",
            2: "Good defense, graceful defer",
            3: "Excellent: clear opinion, solid defense, smooth deferral",
        },
    },
    {
        "id": "quality_3",
        "category": "quality",
        "weight": 0.03,
        "name": "Audit TOOLS.md",
        "task": "Audit TOOLS.md for stale entries: find any tool configurations that reference hosts, CLIs, or credentials that might be outdated based on what you know about the current system state.",
        "expected": "Identifies at least one potentially stale entry with reasoning",
        "rubric": {
            0: "Did not audit or no findings",
            1: "Found something but wrong assessment",
            2: "Good audit with reasonable findings",
            3: "Excellent audit with prioritized recommendations",
        },
    },
]
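
# The category comments above assume the scenario weights form a full
# distribution (sum to 1.0). Warn on stderr if they drift; evaluate() also
# normalizes by the actual total, so the final score stays on a 0-100 scale.
_TOTAL_WEIGHT = sum(s["weight"] for s in SCENARIOS)
if abs(_TOTAL_WEIGHT - 1.0) > 1e-6:
    print(f"[EVAL] Warning: scenario weights sum to {_TOTAL_WEIGHT:.2f}, not 1.00", file=sys.stderr)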


def apply_harness(candidate_dir: Path) -> bool:
    """Apply candidate harness to Hoss workspace (for evaluation)."""
    harness_dir = candidate_dir / "harness"
    if not harness_dir.exists():
        print("[EVAL] No harness directory found")
        return False

    # Backup current configs
    backup_dir = candidate_dir / "backup"
    backup_dir.mkdir(exist_ok=True)
    for fname in ["SOUL.md", "IDENTITY.md", "AGENTS.md", "TOOLS.md", "HEARTBEAT.md"]:
        src = HOSS_WORKSPACE / fname
        if src.exists():
            shutil.copy2(src, backup_dir / fname)

    # Apply candidate harness
    for f in harness_dir.iterdir():
        if f.is_file():
            shutil.copy2(f, HOSS_WORKSPACE / f.name)
    return True


def restore_harness(candidate_dir: Path):
    """Restore the backed-up harness after evaluation."""
    backup_dir = candidate_dir / "backup"
    if not backup_dir.exists():
        return
    for f in backup_dir.iterdir():
        if f.is_file():
            shutil.copy2(f, HOSS_WORKSPACE / f.name)


def run_scenario(scenario: dict, harness_dir: Path) -> int:
    """
    Run a single scenario against the candidate harness.
    Returns score 0-3 based on rubric.
    """
    print(f"\n[EVAL] Running: {scenario['id']} — {scenario['name']}")

    # Apply harness
    candidate_dir = harness_dir.parent
    apply_harness(candidate_dir)

    # Simulate evaluation — in production this would actually run Hoss on the task.
    # For now, we do a heuristic evaluation based on the harness files.
    score = 1  # default: partial pass

    try:
        # Check harness complexity and quality signals
        soul_file = harness_dir / "SOUL.md"
        if soul_file.exists():
            content = soul_file.read_text()
            # Better SOUL.md = more complete personality guidance
            if len(content) > 500 and "##" in content:
                score = max(score, 2)
            if "boundaries" in content.lower() or "red lines" in content.lower():
                score = max(score, 2)
            if "co-founder" in content.lower():
                score = max(score, 3)

        # Check TOOLS.md quality
        tools_file = harness_dir / "TOOLS.md"
        if tools_file.exists():
            content = tools_file.read_text()
            if "##" in content and len(content) > 300:
                score = max(score, 2)
    finally:
        # Always restore the original harness, even if the checks raise
        restore_harness(candidate_dir)

    print(f"[EVAL] {scenario['id']}: score={score}/3")
    return score


def evaluate(candidate_dir: Path) -> dict:
    """Run full benchmark on a candidate harness."""
    harness_dir = candidate_dir / "harness"
    if not harness_dir.exists():
        return {"error": "No harness directory", "scores": {}}

    results = {}
    category_scores = {}

    for scenario in SCENARIOS:
        score = run_scenario(scenario, harness_dir)
        results[scenario["id"]] = {
            "score": score,
            "max": 3,
            "category": scenario["category"],
            "weight": scenario["weight"],
            "name": scenario["name"],
        }
        if scenario["category"] not in category_scores:
            category_scores[scenario["category"]] = []
        category_scores[scenario["category"]].append(score)

    # Calculate weighted final score, normalized by total weight and scaled to 0-100
    total_weight = sum(s["weight"] for s in SCENARIOS)
    final_score = sum(
        results[s["id"]]["score"] / 3 * s["weight"] for s in SCENARIOS
    ) / total_weight * 100

    # Per-category averages
    category_avgs = {
        cat: sum(scores) / len(scores) / 3 * 100
        for cat, scores in category_scores.items()
    }

    return {
        "final_score": round(final_score, 1),
        "category_scores": {k: round(v, 1) for k, v in category_avgs.items()},
        "scenario_scores": {k: v["score"] for k, v in results.items()},
        "total_scenarios": len(SCENARIOS),
        "evaluated_at": datetime.now().isoformat(),
    }


def main():
    parser = argparse.ArgumentParser(description="Benchmark Evaluator")
    parser.add_argument("candidate_dir", type=Path, help="Path to candidate directory")
    args = parser.parse_args()

    if not args.candidate_dir.exists():
        print(f"Error: {args.candidate_dir} does not exist")
        sys.exit(1)

    print(f"\n{'='*50}")
    print(f"Benchmark Evaluation — {args.candidate_dir.name}")
    print(f"{'='*50}")

    results = evaluate(args.candidate_dir)
    if "error" in results:
        print(f"Error: {results['error']}")
        sys.exit(1)

    print(f"\n{'='*50}")
    print(f"FINAL SCORE: {results['final_score']}/100")
    print("Categories:")
    for cat, score in results["category_scores"].items():
        print(f"  {cat}: {score}/100")
    print(f"{'='*50}\n")

    # Output JSON for parsing (kept as the last line of stdout)
    print(json.dumps(results))
    sys.exit(0)


if __name__ == "__main__":
    main()