#!/usr/bin/env python3 """weeyuga-benchmarks-public — generic runner. A portable, agent-driven version of the Weeyuga benchmark harness. Mirrors the canonical Pavilion methodology (temperature=0.1, num_ctx=4096, num_predict=2048, single-loaded-model single-parallel-slot) and runs every suite in `harness/suites/` against every model the agent declares, in sequence. One JSONL ledger captures every call. This script is a TEMPLATE, not a one-click button. The friend running this is expected to be working with a coding agent (Claude Code, Codex, Aider, etc.). The agent reads CLAUDE.md / AGENTS.md to learn the methodology, probes the friend's hardware, picks a target model + an OpenAI-compatible runtime (Ollama / llama.cpp / vLLM / MLX / etc.), then adapts this script to the friend's reality before running. Read CLAUDE.md before reading this code. The runner does ONE thing well: drive an OpenAI-compat /v1/chat/completions endpoint with the canonical prompts, write a JSONL ledger of every call, and emit a manifest. Everything else (hardware probe, model pull, runtime tuning, output curation, PR submission) is the agent's job. Usage examples: # smoke (one cell × hello call only — verify the runtime is reachable) python3 harness/run_benchmark.py --smoke \\ --target-url http://127.0.0.1:11434 \\ --models qwen3.5:0.8b \\ --cell-id-prefix mac:ollama \\ --submitter-handle alice \\ --device-tag mac-m1-8gb # the canonical full suite (all phases × all models) python3 harness/run_benchmark.py \\ --target-url http://127.0.0.1:11434 \\ --models qwen3.5:0.8b,qwen3.5:4b \\ --cell-id-prefix mac:ollama \\ --submitter-handle alice \\ --device-tag mac-m1-8gb Required output goes under: submissions///run-/ run.jsonl — canonical event stream (one JSON object per line) manifest.json — who, what, when, where (no PII) hardware.json — what device the run happened on (filled by agent) metadata.json — computed aggregates (filled by --post-process or by hand) run.md — human-readable summary (filled by agent or by `harness/render_summary.py`) """ from __future__ import annotations import argparse import datetime as dt import json import os import platform import re import socket import sys import time import urllib.error import urllib.request import uuid from pathlib import Path from typing import Any HARNESS_VERSION = "public-1" REPO_ROOT = Path(__file__).resolve().parents[1] SUITES_DIR = REPO_ROOT / "harness" / "suites" SUBMISSIONS_DIR = REPO_ROOT / "submissions" DEFAULT_TARGET_URL = os.environ.get( "WEEYUGA_TARGET_URL", "http://127.0.0.1:11434" ) DEFAULT_TIMEOUT_S = 360 # 6-minute hard wall-clock per call HELLO_PROMPT = "hi can you help me?" # Canonical knobs (Sloba's reference values from the Pavilion methodology). # An agent running this on different hardware MAY override via flags or by # editing this file — but the override has to be recorded in manifest.json # `canonical_options_overrides` so the run is honestly comparable. CANONICAL_OPTIONS = { "temperature": 0.1, "num_ctx": 4096, "num_predict": 2048, } # Suite files in run order. ALL of these run, sequentially, per model, per # this run's --phases setting. Don't reorder — comparability across runs # depends on stable ordering. To skip a suite, use --phases. SUITES = { "5q": "small_model_eval_questions.json", "20q": "python_task_suite_questions.json", "parallel_same": "parallel_qwen_same_model_20q_suite.json", "parallel_mixed": "parallel_qwen_mixed_model_20q_suite.json", "edge_append": "python_context_edge_append_questions.json", "edge_suite": "python_context_edge_suite_only.json", } DEFAULT_PHASES = "hello,5q,20q" # the safest "first run" set # ── tiny utilities ────────────────────────────────────────────────── def utc_now() -> str: return dt.datetime.now(dt.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") def utc_filename_stamp() -> str: return dt.datetime.now(dt.timezone.utc).strftime("%Y%m%dT%H%M%SZ") def load_avg() -> tuple[float, float, float]: try: return os.getloadavg() except (OSError, AttributeError): return (0.0, 0.0, 0.0) def log(message: str) -> None: print(f"{utc_now()} {message}", flush=True) def write_jsonl(handle, record: dict[str, Any]) -> None: handle.write(json.dumps(record, ensure_ascii=False) + "\n") handle.flush() try: os.fsync(handle.fileno()) except (OSError, ValueError): pass # ── HTTP layer ────────────────────────────────────────────────────── def list_models(target_url: str, timeout: int = 15) -> list[str]: """Read /v1/models from the target. Standard OpenAI-compat endpoint.""" req = urllib.request.Request(f"{target_url.rstrip('/')}/v1/models") try: with urllib.request.urlopen(req, timeout=timeout) as resp: body = json.loads(resp.read().decode("utf-8")) except (urllib.error.URLError, urllib.error.HTTPError, TimeoutError, OSError) as exc: log(f"WARN: could not list models on {target_url} ({exc!r}); " "agent must pass --models explicitly.") return [] out: list[str] = [] for item in body.get("data", []): if item.get("status") == "placeholder": continue mid = item.get("id") if mid: out.append(mid) return out def chat_completion( target_url: str, model: str, user_prompt: str, timeout: int = DEFAULT_TIMEOUT_S, max_tokens_override: int | None = None, canonical_options: dict | None = None, ) -> dict[str, Any]: """One non-streaming call. Returns a dict with timing + content + error.""" opts = dict(canonical_options or CANONICAL_OPTIONS) body = { "model": model, "messages": [{"role": "user", "content": user_prompt}], "stream": False, "max_tokens": max_tokens_override or opts["num_predict"], "temperature": opts["temperature"], # Ollama-flavored extras; harmless on llama.cpp / vLLM (ignored). "extra_body": {"options": opts}, } req = urllib.request.Request( f"{target_url.rstrip('/')}/v1/chat/completions", data=json.dumps(body).encode("utf-8"), headers={"Content-Type": "application/json"}, ) started = time.perf_counter() out: dict[str, Any] = { "duration_seconds": None, "response_text": "", "response_chars": 0, "prompt_tokens": None, "completion_tokens": None, "total_tokens": None, "tokens_per_second": None, "finish_reason": None, "status_code": None, "error": None, } try: with urllib.request.urlopen(req, timeout=timeout) as resp: body_bytes = resp.read() out["status_code"] = resp.status elapsed = time.perf_counter() - started out["duration_seconds"] = round(elapsed, 3) payload = json.loads(body_bytes.decode("utf-8")) choices = payload.get("choices") or [] if choices: msg = choices[0].get("message") or {} out["response_text"] = msg.get("content") or "" out["finish_reason"] = choices[0].get("finish_reason") out["response_chars"] = len(out["response_text"]) usage = payload.get("usage") or {} out["prompt_tokens"] = usage.get("prompt_tokens") out["completion_tokens"] = usage.get("completion_tokens") out["total_tokens"] = usage.get("total_tokens") if ( out["completion_tokens"] and out["duration_seconds"] and out["duration_seconds"] > 0 ): out["tokens_per_second"] = round( out["completion_tokens"] / out["duration_seconds"], 2 ) except (urllib.error.URLError, urllib.error.HTTPError, TimeoutError, OSError) as exc: out["duration_seconds"] = round(time.perf_counter() - started, 3) out["error"] = repr(exc) except json.JSONDecodeError as exc: out["duration_seconds"] = round(time.perf_counter() - started, 3) out["error"] = f"JSONDecodeError: {exc}" return out # ── eval helpers (mirrored from canonical Pavilion harness) ───────── def marker_hits(required: list[str], text: str) -> list[str]: lowered = (text or "").lower() return [m for m in required if m.lower() in lowered] def format_ok(rule: str, text: str) -> bool: """Lightweight heuristic format-checker. Mirrors capture_small_model_eval.py.""" stripped = (text or "").strip() lines = [line.strip() for line in stripped.splitlines() if line.strip()] if rule == "bash_code": return stripped.startswith("#!/usr/bin/env bash") if rule == "python_code": return ( "def is_valid_ipv4" in stripped and sum(1 for line in lines if line.startswith("def test_")) >= 3 ) if rule == "shell_lines": return ( all(not line.startswith(("1.", "- ", "* ")) for line in lines[:3]) and "nginx -t" in stripped ) if rule == "four_numbered_steps": return sum(1 for line in lines if re.match(r"^[1-4]\. ", line)) >= 4 if rule == "five_bullets": return sum(1 for line in lines if line.startswith(("- ", "* "))) >= 5 if rule == "json_dict": try: parsed = json.loads(stripped) return isinstance(parsed, dict) except (json.JSONDecodeError, ValueError): return False if rule == "pytest_code": return ( "def test_" in stripped and ("import pytest" in stripped or "pytest" in stripped) ) return False # ── per-call record builder ───────────────────────────────────────── def make_call_record( cell_id: str, model: str, phase: str, question_id: str, prompt: str, run_idx: int, required_markers: list[str], format_rule: str, target_url: str, timeout: int, canonical_options: dict | None = None, ) -> dict[str, Any]: result = chat_completion( target_url, model, prompt, timeout=timeout, canonical_options=canonical_options, ) text = result["response_text"] hits = marker_hits(required_markers, text) return { "type": "call", "ts_utc": utc_now(), "cell_id": cell_id, "model": model, "phase": phase, "question_id": question_id, "run_idx": run_idx, "duration_seconds": result["duration_seconds"], "prompt_tokens": result["prompt_tokens"], "completion_tokens": result["completion_tokens"], "tokens_per_second": result["tokens_per_second"], "finish_reason": result["finish_reason"], "status_code": result["status_code"], "response_chars": result["response_chars"], "response_preview": (text or "")[:240], "required_markers": required_markers, "markers_hit": hits, "marker_hit_rate": ( round(len(hits) / len(required_markers), 3) if required_markers else None ), "format_rule": format_rule, "format_ok": format_ok(format_rule, text) if format_rule else None, "usable_answer": bool((text or "").strip()), "error": result["error"], } # ── phase runners ─────────────────────────────────────────────────── def run_hello(handle, model, target_url, timeout, cell_id_prefix, options): cell_id = f"{cell_id_prefix}:{model}" log(f"[hello] {model}") rec = make_call_record( cell_id=cell_id, model=model, phase="hello", question_id="hello_check", prompt=HELLO_PROMPT, run_idx=0, required_markers=[], format_rule="", target_url=target_url, timeout=timeout, canonical_options=options, ) write_jsonl(handle, rec) log(f" ↳ {rec['duration_seconds']}s " f"completion_tokens={rec['completion_tokens']} " f"finish={rec['finish_reason']} err={rec['error']}") def run_frozen_prompts(handle, model, target_url, timeout, cell_id_prefix, options): """3 frozen prompts (P-EASY/P-MEDIUM/P-HARD) from harness/prompts.py.""" sys.path.insert(0, str(REPO_ROOT / "harness")) from prompts import PROMPTS # type: ignore[import-not-found] cell_id = f"{cell_id_prefix}:{model}" for q_idx, (pid, p) in enumerate(PROMPTS.items()): log(f"[frozen] {model} {pid}") rec = make_call_record( cell_id=cell_id, model=model, phase="frozen", question_id=pid, prompt=p["prompt"], run_idx=q_idx, required_markers=[], format_rule="", target_url=target_url, timeout=timeout, canonical_options=options, ) # Honor per-prompt max_tokens floor rec["max_tokens_used"] = options.get("num_predict") if options else CANONICAL_OPTIONS["num_predict"] write_jsonl(handle, rec) log(f" ↳ {rec['duration_seconds']}s " f"completion_tokens={rec['completion_tokens']} err={rec['error']}") def run_suite(handle, model, target_url, timeout, suite_path, phase, cell_id_prefix, options): """Drive any of the 6 .json suites (5q / 20q / parallel_* / edge_*).""" suite = json.loads(suite_path.read_text(encoding="utf-8")) questions = suite.get("questions") or [] if not questions: log(f"WARN: suite {suite_path.name} has no questions; skipping") return cell_id = f"{cell_id_prefix}:{model}" for q_idx, q in enumerate(questions): log(f"[{phase}] {model} {q.get('id', f'q{q_idx}')} " f"({q_idx + 1}/{len(questions)})") rec = make_call_record( cell_id=cell_id, model=model, phase=phase, question_id=q.get("id", f"q{q_idx}"), prompt=q["prompt"], run_idx=q_idx, required_markers=q.get("required_markers") or [], format_rule=q.get("format_rule") or "", target_url=target_url, timeout=timeout, canonical_options=options, ) write_jsonl(handle, rec) log(f" ↳ {rec['duration_seconds']}s " f"completion_tokens={rec['completion_tokens']} " f"format_ok={rec['format_ok']} markers={rec['marker_hit_rate']} " f"err={rec['error']}") # ── CLI ───────────────────────────────────────────────────────────── def parse_args() -> argparse.Namespace: p = argparse.ArgumentParser( description="Run the Weeyuga benchmark suite against a local model.", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=( "Read CLAUDE.md / AGENTS.md before running. The friend's coding\n" "agent is expected to adapt this script to the friend's hardware\n" "(GPU vs CPU, VRAM constraints, cold-start tuning) and record any\n" "deviations in manifest.json." ), ) p.add_argument("--target-url", default=DEFAULT_TARGET_URL, help="OpenAI-compat base URL (default http://127.0.0.1:11434).") p.add_argument("--models", default="auto", help="Comma-separated model list, or 'auto' to pull from /v1/models.") p.add_argument("--cell-id-prefix", required=False, default="local:ollama", help="Prefix for cell_id. Pattern: :. " "Examples: mac-m1:ollama, predator:llamacpp, vps:cpu.") p.add_argument("--phases", default=DEFAULT_PHASES, help=("Comma-separated subset of: hello, frozen, " + ", ".join(SUITES.keys()) + ". Default: hello,5q,20q. Pass 'all' for the full suite.")) p.add_argument("--timeout", type=int, default=DEFAULT_TIMEOUT_S, help="Per-call wall-clock timeout (default 360).") p.add_argument("--probe", action="store_true", help="Health-check only: list models, do one hello call, exit.") p.add_argument("--smoke", action="store_true", help="One model × hello-only. End-to-end runtime validation.") p.add_argument("--submitter-handle", required=False, default=None, help="Your Gitea (or any) handle. Used in submissions//...") p.add_argument("--device-tag", required=False, default=None, help="Short device tag. Examples: mac-m1-8gb, rtx-4090-pc, " "predator-gtx-1060-6gb. Used in submissions///...") p.add_argument("--run-id", default=None, help="Override auto-generated run UUID (for resuming).") p.add_argument("--out-dir", default=None, help="Override the submissions//// output dir.") p.add_argument("--temperature", type=float, default=None, help="Override canonical temperature 0.1.") p.add_argument("--num-ctx", type=int, default=None, help="Override canonical num_ctx 4096.") p.add_argument("--num-predict", type=int, default=None, help="Override canonical num_predict 2048.") return p.parse_args() def resolve_phases(phases_arg: str) -> list[str]: if phases_arg.strip().lower() == "all": return ["hello", "frozen"] + list(SUITES.keys()) raw = [p.strip() for p in phases_arg.split(",") if p.strip()] out = [] for r in raw: if r in ("hello", "frozen") or r in SUITES: out.append(r) else: raise SystemExit( f"unknown phase {r!r}; valid: hello, frozen, " + ", ".join(SUITES.keys()) + ", or 'all'" ) return out def resolve_models(models_arg: str, target_url: str) -> list[str]: available = list_models(target_url) log(f" /v1/models reports {len(available)}: {available}") if models_arg == "auto": if not available: raise SystemExit( "auto-list found no models on target; pass --models explicitly " "(e.g. --models qwen3.5:0.8b,qwen3.5:4b)." ) return available wanted = [m.strip() for m in models_arg.split(",") if m.strip()] if available: missing = [m for m in wanted if m not in available] if missing: log(f"WARN: requested but not on target: {missing}. " "The runner will try anyway — your runtime may auto-pull.") return wanted def write_manifest(out_dir: Path, args, run_id: str, phases: list[str], models: list[str], options: dict) -> None: manifest = { "schema_version": "manifest-1.0", "run_id": run_id, "harness_version": HARNESS_VERSION, "submitter_handle": args.submitter_handle, "device_tag": args.device_tag, "cell_id_prefix": args.cell_id_prefix, "target_url": args.target_url, "phases_run": phases, "models_run": models, "canonical_options": dict(CANONICAL_OPTIONS), "canonical_options_overrides": { k: v for k, v in { "temperature": args.temperature, "num_ctx": args.num_ctx, "num_predict": args.num_predict, }.items() if v is not None }, "timeout_seconds": args.timeout, "started_at_utc": utc_now(), "host_hostname_short": socket.gethostname().split(".")[0], "platform_system": platform.system(), "platform_release": platform.release(), "python_version": platform.python_version(), } (out_dir / "manifest.json").write_text( json.dumps(manifest, indent=2, ensure_ascii=False) + "\n", encoding="utf-8", ) def main() -> int: args = parse_args() target_url = args.target_url.rstrip("/") log(f"target_url = {target_url}") options = dict(CANONICAL_OPTIONS) if args.temperature is not None: options["temperature"] = args.temperature if args.num_ctx is not None: options["num_ctx"] = args.num_ctx if args.num_predict is not None: options["num_predict"] = args.num_predict log("listing /v1/models on target …") models = resolve_models(args.models, target_url) if args.probe: if not models: log("no models available; abort") return 1 smallest = sorted(models, key=lambda m: ( 0 if "0.5b" in m or "0.6b" in m else 1 if "0.8b" in m else 2 if "1.5b" in m or "1b" in m else 3 if "2b" in m else 4 ))[0] log(f"probe: hello call against smallest = {smallest}") rec = make_call_record( cell_id=f"{args.cell_id_prefix}:{smallest}", model=smallest, phase="probe", question_id="hello_check", prompt=HELLO_PROMPT, run_idx=0, required_markers=[], format_rule="", target_url=target_url, timeout=min(args.timeout, 60), canonical_options=options, ) log(json.dumps(rec, indent=2, ensure_ascii=False)) return 0 if not rec["error"] else 3 if args.smoke: models = [models[0]] if models else [] if not models: log("no models available for smoke; abort") return 1 log(f"smoke: hello-only against {models[0]}") phases = ["hello"] else: phases = resolve_phases(args.phases) if not models: log("no models to run; abort") return 1 if not args.submitter_handle and not args.out_dir: raise SystemExit( "--submitter-handle is required (or pass --out-dir for ad-hoc runs)." ) if not args.device_tag and not args.out_dir: raise SystemExit( "--device-tag is required (or pass --out-dir for ad-hoc runs)." ) run_id = args.run_id or str(uuid.uuid4()) out_dir = ( Path(args.out_dir) if args.out_dir else SUBMISSIONS_DIR / args.submitter_handle / args.device_tag / f"run-{run_id}" ) out_dir.mkdir(parents=True, exist_ok=True) out_path = out_dir / "run.jsonl" log(f"writing JSONL ledger to {out_path}") write_manifest(out_dir, args, run_id, phases, models, options) started_at = utc_now() started_load = load_avg() meta = { "type": "meta", "benchmark_run_id": run_id, "harness_version": HARNESS_VERSION, "started_at_utc": started_at, "host_hostname_short": socket.gethostname().split(".")[0], "load_avg_start": started_load, "target_url": target_url, "cell_id_prefix": args.cell_id_prefix, "submitter_handle": args.submitter_handle, "device_tag": args.device_tag, "execution_shape": "per-model-block", "phases_planned": phases, "models_planned": models, "canonical_options": dict(CANONICAL_OPTIONS), "canonical_options_effective": options, "timeout_seconds": args.timeout, "platform_system": platform.system(), "platform_release": platform.release(), "python_version": platform.python_version(), } with out_path.open("w", encoding="utf-8") as fh: write_jsonl(fh, meta) try: for model in models: log(f"=== model block: {model} ===") for phase in phases: if phase == "hello": run_hello(fh, model, target_url, args.timeout, args.cell_id_prefix, options) elif phase == "frozen": run_frozen_prompts(fh, model, target_url, args.timeout, args.cell_id_prefix, options) else: suite_path = SUITES_DIR / SUITES[phase] if not suite_path.exists(): log(f"WARN: suite missing at {suite_path}; skip") continue run_suite(fh, model, target_url, args.timeout, suite_path, phase, args.cell_id_prefix, options) except KeyboardInterrupt: log("interrupted — partial ledger written") write_jsonl(fh, { "type": "interrupted", "ts_utc": utc_now(), "reason": "KeyboardInterrupt", }) return 130 finally: write_jsonl(fh, { "type": "footer", "ts_utc": utc_now(), "finished_at_utc": utc_now(), "load_avg_end": load_avg(), }) log(f"done — ledger at {out_path}") log(f"next: agent fills hardware.json + run.md, then opens a PR.") return 0 if __name__ == "__main__": raise SystemExit(main())