weeyuga-benchmarks-public/harness/run_benchmark.py

#!/usr/bin/env python3
"""weeyuga-benchmarks-public — generic runner.

A portable, agent-driven version of the Weeyuga benchmark harness. Mirrors
the canonical Pavilion methodology (temperature=0.1, num_ctx=4096,
num_predict=2048, single-loaded-model single-parallel-slot) and runs each
suite selected with `--phases` (suite files live in `harness/suites/`)
against every model the agent declares, in sequence. One JSONL ledger
captures every call.

This script is a TEMPLATE, not a one-click button.

The friend running this is expected to be working with a coding agent (Claude
Code, Codex, Aider, etc.). The agent reads CLAUDE.md / AGENTS.md to learn the
methodology, probes the friend's hardware, picks a target model + an
OpenAI-compatible runtime (Ollama / llama.cpp / vLLM / MLX / etc.), then
adapts this script to the friend's reality before running.

Read CLAUDE.md before reading this code.

The runner does ONE thing well: drive an OpenAI-compat /v1/chat/completions
endpoint with the canonical prompts, write a JSONL ledger of every call, and
emit a manifest. Everything else (hardware probe, model pull, runtime tuning,
output curation, PR submission) is the agent's job.

Usage examples:
  # smoke (one cell × hello call only — verify the runtime is reachable)
  python3 harness/run_benchmark.py --smoke \\
      --target-url http://127.0.0.1:11434 \\
      --models qwen3.5:0.8b \\
      --cell-id-prefix mac:ollama \\
      --submitter-handle alice \\
      --device-tag mac-m1-8gb

  # the canonical full suite (all phases × all models); without
  # `--phases all` only the default hello,5q,20q phases run
  python3 harness/run_benchmark.py \\
      --target-url http://127.0.0.1:11434 \\
      --models qwen3.5:0.8b,qwen3.5:4b \\
      --phases all \\
      --cell-id-prefix mac:ollama \\
      --submitter-handle alice \\
      --device-tag mac-m1-8gb

Required output goes under:
  submissions/<submitter-handle>/<device-tag>/run-<benchmark-run-id>/
    run.jsonl       — canonical event stream (one JSON object per line)
    manifest.json   — who, what, when, where (no PII)
    hardware.json   — what device the run happened on (filled by agent)
    metadata.json   — computed aggregates (filled by --post-process or by hand)
    run.md          — human-readable summary (filled by agent or by
                      `harness/render_summary.py`)
"""

from __future__ import annotations

import argparse
import datetime as dt
import json
import os
import platform
import re
import socket
import sys
import time
import urllib.error
import urllib.request
import uuid
from pathlib import Path
from typing import Any

# Version tag written into manifest.json and into the ledger's meta record.
HARNESS_VERSION = "public-1"

# Repository layout: REPO_ROOT is the directory one level above the directory
# that contains this file; suites are read from, and submissions written
# under, that root.
REPO_ROOT = Path(__file__).resolve().parents[1]
SUITES_DIR = REPO_ROOT / "harness" / "suites"
SUBMISSIONS_DIR = REPO_ROOT / "submissions"

# Default base URL of the OpenAI-compatible runtime. The WEEYUGA_TARGET_URL
# environment variable, when set, replaces the built-in localhost value.
DEFAULT_TARGET_URL = os.environ.get(
    "WEEYUGA_TARGET_URL", "http://127.0.0.1:11434"
)
DEFAULT_TIMEOUT_S = 360  # 6 minutes; handed to urlopen(timeout=...) per call

# Prompt sent by the "hello" phase and by --probe.
HELLO_PROMPT = "hi can you help me?"

# Canonical knobs (Sloba's reference values from the Pavilion methodology).
# An agent running this on different hardware MAY override via flags or by
# editing this file — but the override has to be recorded in manifest.json
# `canonical_options_overrides` so the run is honestly comparable.
CANONICAL_OPTIONS = {
    "temperature": 0.1,
    "num_ctx": 4096,
    "num_predict": 2048,
}

# Phase name -> suite file name under SUITES_DIR. `--phases all` runs the
# suites in this dict order (after "hello" and "frozen"), so don't reorder —
# comparability across runs depends on stable ordering. An explicit --phases
# list runs in the order the user typed it. To skip a suite, use --phases.
SUITES = {
    "5q":             "small_model_eval_questions.json",
    "20q":            "python_task_suite_questions.json",
    "parallel_same":  "parallel_qwen_same_model_20q_suite.json",
    "parallel_mixed": "parallel_qwen_mixed_model_20q_suite.json",
    "edge_append":    "python_context_edge_append_questions.json",
    "edge_suite":     "python_context_edge_suite_only.json",
}
DEFAULT_PHASES = "hello,5q,20q"  # the safest "first run" set


# ── tiny utilities ──────────────────────────────────────────────────


def utc_now() -> str:
    """Return the current UTC time as `YYYY-MM-DDTHH:MM:SSZ` (second precision)."""
    moment = dt.datetime.now(tz=dt.timezone.utc)
    return moment.strftime("%Y-%m-%dT%H:%M:%SZ")


def utc_filename_stamp() -> str:
    """Return the current UTC time as a compact `YYYYMMDDTHHMMSSZ` stamp.

    The result contains no separators such as ':' or '-'.
    """
    moment = dt.datetime.now(tz=dt.timezone.utc)
    return moment.strftime("%Y%m%dT%H%M%SZ")


def load_avg() -> tuple[float, float, float]:
    """Return the three values reported by `os.getloadavg()`.

    Falls back to `(0.0, 0.0, 0.0)` when the call raises OSError or the
    function does not exist on `os` (AttributeError).
    """
    try:
        reading = os.getloadavg()
    except (AttributeError, OSError):
        reading = (0.0, 0.0, 0.0)
    return reading


def log(message: str) -> None:
    """Print *message* prefixed with the current UTC timestamp, flushing at once."""
    stamp = utc_now()
    print(f"{stamp} {message}", flush=True)


def write_jsonl(handle, record: dict[str, Any]) -> None:
    """Append *record* to *handle* as one JSON line and push it towards disk.

    The record is serialised with `ensure_ascii=False`, written with a
    trailing newline in a single `write` call, then flushed. An `os.fsync`
    is attempted afterwards; handles that cannot provide a usable file
    descriptor (OSError / ValueError) are tolerated silently.
    """
    line = json.dumps(record, ensure_ascii=False)
    handle.write(f"{line}\n")
    handle.flush()
    try:
        descriptor = handle.fileno()
        os.fsync(descriptor)
    except (OSError, ValueError):
        # Best effort only: in-memory or closed handles have nothing to sync.
        return


# ── HTTP layer ──────────────────────────────────────────────────────


def list_models(target_url: str, timeout: int = 15) -> list[str]:
    """Return the model ids advertised by `<target_url>/v1/models`.

    Args:
        target_url: Base URL of the runtime; a trailing slash is tolerated.
        timeout: Timeout in seconds handed to `urlopen`.

    Returns:
        The non-empty `id` values of the entries under the response's `data`
        key, in response order, skipping entries whose `status` is
        `"placeholder"`. An empty list is returned when the endpoint cannot
        be reached, when the body is not valid UTF-8 JSON (both logged as a
        warning), or when the JSON does not have the expected
        `{"data": [ {...}, ... ]}` shape.
    """
    req = urllib.request.Request(f"{target_url.rstrip('/')}/v1/models")
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            body = json.loads(resp.read().decode("utf-8"))
    except (
        urllib.error.URLError,
        urllib.error.HTTPError,
        TimeoutError,
        OSError,
        ValueError,  # json.JSONDecodeError and UnicodeDecodeError
    ) as exc:
        log(f"WARN: could not list models on {target_url} ({exc!r}); "
            "agent must pass --models explicitly.")
        return []

    # A runtime may answer with something other than {"data": [...]};
    # treat every unexpected shape as "no models" instead of crashing.
    entries = body.get("data") if isinstance(body, dict) else None
    if not isinstance(entries, list):
        return []

    out: list[str] = []
    for item in entries:
        if not isinstance(item, dict):
            continue
        if item.get("status") == "placeholder":
            continue
        mid = item.get("id")
        if mid:
            out.append(mid)
    return out


def chat_completion(
    target_url: str,
    model: str,
    user_prompt: str,
    timeout: int = DEFAULT_TIMEOUT_S,
    max_tokens_override: int | None = None,
    canonical_options: dict | None = None,
) -> dict[str, Any]:
    """Send one non-streaming request to `<target_url>/v1/chat/completions`.

    Args:
        target_url: Base URL of the runtime; a trailing slash is tolerated.
        model: Model identifier placed in the request body.
        user_prompt: Content of the single user message.
        timeout: Timeout in seconds handed to `urlopen`.
        max_tokens_override: When truthy, sent as `max_tokens` instead of the
            options' `num_predict`.
        canonical_options: Sampling/context options; a falsy value means
            `CANONICAL_OPTIONS`.

    Returns:
        A dict with timing, response text, token usage, `finish_reason`,
        `status_code` and `error`. Transport, HTTP and decoding failures never
        raise: they are reported through `error` (with `status_code` filled
        in when the server answered with an HTTP error status).
    """
    # Function-scope import: http.client is not part of this file's import block.
    import http.client

    opts = dict(canonical_options or CANONICAL_OPTIONS)
    body = {
        "model": model,
        "messages": [{"role": "user", "content": user_prompt}],
        "stream": False,
        "max_tokens": max_tokens_override or opts["num_predict"],
        "temperature": opts["temperature"],
        # Ollama-flavored extras; harmless on llama.cpp / vLLM (ignored).
        # NOTE(review): `extra_body` is a keyword of the OpenAI Python SDK; here
        # it is sent as a literal JSON field. Confirm the target runtime really
        # reads `options` from it, otherwise num_ctx is not being applied.
        "extra_body": {"options": opts},
    }
    req = urllib.request.Request(
        f"{target_url.rstrip('/')}/v1/chat/completions",
        data=json.dumps(body).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    started = time.perf_counter()
    out: dict[str, Any] = {
        "duration_seconds": None,
        "response_text": "",
        "response_chars": 0,
        "prompt_tokens": None,
        "completion_tokens": None,
        "total_tokens": None,
        "tokens_per_second": None,
        "finish_reason": None,
        "status_code": None,
        "error": None,
    }

    def _elapsed() -> float:
        return round(time.perf_counter() - started, 3)

    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            body_bytes = resp.read()
            out["status_code"] = resp.status
        out["duration_seconds"] = _elapsed()
        payload = json.loads(body_bytes.decode("utf-8"))
        if not isinstance(payload, dict):
            # Valid JSON but not an object: nothing to extract from it.
            out["error"] = (
                f"unexpected response shape: {type(payload).__name__}"
            )
            return out
        choices = payload.get("choices") or []
        first = choices[0] if isinstance(choices, list) and choices else None
        if isinstance(first, dict):
            msg = first.get("message")
            if not isinstance(msg, dict):
                msg = {}
            out["response_text"] = msg.get("content") or ""
            out["finish_reason"] = first.get("finish_reason")
        out["response_chars"] = len(out["response_text"])
        usage = payload.get("usage")
        if not isinstance(usage, dict):
            usage = {}
        out["prompt_tokens"] = usage.get("prompt_tokens")
        out["completion_tokens"] = usage.get("completion_tokens")
        out["total_tokens"] = usage.get("total_tokens")
        if (
            out["completion_tokens"]
            and out["duration_seconds"]
            and out["duration_seconds"] > 0
        ):
            out["tokens_per_second"] = round(
                out["completion_tokens"] / out["duration_seconds"], 2
            )
    except urllib.error.HTTPError as exc:
        # The server answered, just not with 2xx: keep its status code in the
        # ledger instead of leaving status_code empty.
        out["duration_seconds"] = _elapsed()
        out["status_code"] = exc.code
        out["error"] = repr(exc)
    except (
        urllib.error.URLError,
        TimeoutError,
        OSError,
        http.client.HTTPException,  # e.g. IncompleteRead on a truncated body
    ) as exc:
        out["duration_seconds"] = _elapsed()
        out["error"] = repr(exc)
    except json.JSONDecodeError as exc:
        out["duration_seconds"] = _elapsed()
        out["error"] = f"JSONDecodeError: {exc}"
    except UnicodeDecodeError as exc:
        # Body was not valid UTF-8; report it like any other decode failure.
        out["duration_seconds"] = _elapsed()
        out["error"] = f"UnicodeDecodeError: {exc}"
    return out


# ── eval helpers (mirrored from canonical Pavilion harness) ─────────


def marker_hits(required: list[str], text: str) -> list[str]:
    """Return the markers from *required* that occur in *text*, ignoring case.

    Markers keep their original spelling and order; a falsy *text* is treated
    as the empty string.
    """
    haystack = text.lower() if text else ""
    found: list[str] = []
    for marker in required:
        if marker.lower() in haystack:
            found.append(marker)
    return found


def format_ok(rule: str, text: str) -> bool:
    """Lightweight heuristic format-checker. Mirrors capture_small_model_eval.py."""
    stripped = (text or "").strip()
    lines = [line.strip() for line in stripped.splitlines() if line.strip()]
    if rule == "bash_code":
        return stripped.startswith("#!/usr/bin/env bash")
    if rule == "python_code":
        return (
            "def is_valid_ipv4" in stripped
            and sum(1 for line in lines if line.startswith("def test_")) >= 3
        )
    if rule == "shell_lines":
        return (
            all(not line.startswith(("1.", "- ", "* ")) for line in lines[:3])
            and "nginx -t" in stripped
        )
    if rule == "four_numbered_steps":
        return sum(1 for line in lines if re.match(r"^[1-4]\. ", line)) >= 4
    if rule == "five_bullets":
        return sum(1 for line in lines if line.startswith(("- ", "* "))) >= 5
    if rule == "json_dict":
        try:
            parsed = json.loads(stripped)
            return isinstance(parsed, dict)
        except (json.JSONDecodeError, ValueError):
            return False
    if rule == "pytest_code":
        return (
            "def test_" in stripped
            and ("import pytest" in stripped or "pytest" in stripped)
        )
    return False


# ── per-call record builder ─────────────────────────────────────────


def make_call_record(
    cell_id: str,
    model: str,
    phase: str,
    question_id: str,
    prompt: str,
    run_idx: int,
    required_markers: list[str],
    format_rule: str,
    target_url: str,
    timeout: int,
    canonical_options: dict | None = None,
) -> dict[str, Any]:
    """Run one prompt through `chat_completion` and build its ledger record.

    The returned dict has `"type": "call"` and carries the identifying
    fields, the timing/token/status values from the call, a 240-character
    preview of the answer, the marker and format evaluation (`None` when no
    markers or no format rule were given), a `usable_answer` flag that is
    true when the answer contains non-whitespace text, and the call's
    `error`.
    """
    outcome = chat_completion(
        target_url,
        model,
        prompt,
        timeout=timeout,
        canonical_options=canonical_options,
    )
    answer = outcome["response_text"]
    matched = marker_hits(required_markers, answer)
    stamp = utc_now()

    hit_rate = None
    if required_markers:
        hit_rate = round(len(matched) / len(required_markers), 3)

    shape_ok = None
    if format_rule:
        shape_ok = format_ok(format_rule, answer)

    # Keys are inserted in the ledger's established order.
    record: dict[str, Any] = {
        "type": "call",
        "ts_utc": stamp,
        "cell_id": cell_id,
        "model": model,
        "phase": phase,
        "question_id": question_id,
        "run_idx": run_idx,
    }
    passthrough = (
        "duration_seconds",
        "prompt_tokens",
        "completion_tokens",
        "tokens_per_second",
        "finish_reason",
        "status_code",
        "response_chars",
    )
    for field in passthrough:
        record[field] = outcome[field]
    record["response_preview"] = (answer or "")[:240]
    record["required_markers"] = required_markers
    record["markers_hit"] = matched
    record["marker_hit_rate"] = hit_rate
    record["format_rule"] = format_rule
    record["format_ok"] = shape_ok
    record["usable_answer"] = bool((answer or "").strip())
    record["error"] = outcome["error"]
    return record


# ── phase runners ───────────────────────────────────────────────────


def run_hello(handle, model, target_url, timeout, cell_id_prefix, options):
    """Send the hello prompt to *model* once and append the record to *handle*.

    The call is recorded under phase "hello" with question id "hello_check",
    no required markers and no format rule; a one-line result is logged.
    """
    log(f"[hello] {model}")
    record = make_call_record(
        cell_id=f"{cell_id_prefix}:{model}",
        model=model,
        phase="hello",
        question_id="hello_check",
        prompt=HELLO_PROMPT,
        run_idx=0,
        required_markers=[],
        format_rule="",
        target_url=target_url,
        timeout=timeout,
        canonical_options=options,
    )
    write_jsonl(handle, record)
    seconds = record["duration_seconds"]
    tokens = record["completion_tokens"]
    finish = record["finish_reason"]
    failure = record["error"]
    log(f"  ↳ {seconds}s completion_tokens={tokens} finish={finish} err={failure}")


def run_frozen_prompts(handle, model, target_url, timeout, cell_id_prefix, options):
    """Run every prompt in `PROMPTS` (from harness/prompts.py) against *model*.

    The original docstring describes these as 3 frozen prompts
    (P-EASY/P-MEDIUM/P-HARD). Each entry is sent in `PROMPTS` iteration order
    under phase "frozen", with no markers and no format rule, and its record
    is appended to *handle* with an extra `max_tokens_used` field.
    """
    # Make harness/prompts.py importable. The function runs once per model,
    # so only add the directory when it is not already on sys.path.
    harness_dir = str(REPO_ROOT / "harness")
    if harness_dir not in sys.path:
        sys.path.insert(0, harness_dir)
    from prompts import PROMPTS  # type: ignore[import-not-found]

    cell_id = f"{cell_id_prefix}:{model}"
    for q_idx, (pid, p) in enumerate(PROMPTS.items()):
        log(f"[frozen] {model} {pid}")
        rec = make_call_record(
            cell_id=cell_id, model=model, phase="frozen",
            question_id=pid, prompt=p["prompt"], run_idx=q_idx,
            required_markers=[], format_rule="",
            target_url=target_url, timeout=timeout,
            canonical_options=options,
        )
        # Record the completion cap in effect for the call: num_predict from
        # the effective options. No per-prompt max_tokens value is applied,
        # because make_call_record has no parameter to pass one through.
        rec["max_tokens_used"] = (
            options.get("num_predict") if options
            else CANONICAL_OPTIONS["num_predict"]
        )
        write_jsonl(handle, rec)
        log(f"  ↳ {rec['duration_seconds']}s "
            f"completion_tokens={rec['completion_tokens']} err={rec['error']}")


def run_suite(handle, model, target_url, timeout, suite_path, phase, cell_id_prefix, options):
    """Run every question of the JSON suite at *suite_path* against *model*.

    The suite file is expected to hold a `questions` list; each question
    supplies `prompt` and optionally `id`, `required_markers` and
    `format_rule`. One record per question is appended to *handle*. A suite
    with no questions is skipped with a warning.
    """
    raw_suite = suite_path.read_text(encoding="utf-8")
    questions = json.loads(raw_suite).get("questions") or []
    if not questions:
        log(f"WARN: suite {suite_path.name} has no questions; skipping")
        return

    total = len(questions)
    cell_id = f"{cell_id_prefix}:{model}"
    for index, question in enumerate(questions):
        # Questions without an explicit id get a positional one.
        qid = question.get("id", f"q{index}")
        log(f"[{phase}] {model} {qid} ({index + 1}/{total})")
        record = make_call_record(
            cell_id=cell_id,
            model=model,
            phase=phase,
            question_id=qid,
            prompt=question["prompt"],
            run_idx=index,
            required_markers=question.get("required_markers") or [],
            format_rule=question.get("format_rule") or "",
            target_url=target_url,
            timeout=timeout,
            canonical_options=options,
        )
        write_jsonl(handle, record)
        seconds = record["duration_seconds"]
        tokens = record["completion_tokens"]
        shape = record["format_ok"]
        rate = record["marker_hit_rate"]
        failure = record["error"]
        log(f"  ↳ {seconds}s completion_tokens={tokens} "
            f"format_ok={shape} markers={rate} err={failure}")


# ── CLI ─────────────────────────────────────────────────────────────


def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Build the command-line parser and parse *argv*.

    Args:
        argv: Argument strings to parse. `None` (the default) makes argparse
            read `sys.argv[1:]`, which is the original behaviour.

    Returns:
        The parsed namespace. Defaults shown in `--help` are rendered from
        the real default values (`%(default)s` / `CANONICAL_OPTIONS`), so the
        help stays correct when e.g. WEEYUGA_TARGET_URL changes the default.
    """
    p = argparse.ArgumentParser(
        description="Run the Weeyuga benchmark suite against a local model.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "Read CLAUDE.md / AGENTS.md before running. The friend's coding\n"
            "agent is expected to adapt this script to the friend's hardware\n"
            "(GPU vs CPU, VRAM constraints, cold-start tuning) and record any\n"
            "deviations in manifest.json."
        ),
    )
    p.add_argument("--target-url", default=DEFAULT_TARGET_URL,
        help="OpenAI-compat base URL (default %(default)s).")
    p.add_argument("--models", default="auto",
        help="Comma-separated model list, or 'auto' to pull from /v1/models.")
    p.add_argument("--cell-id-prefix", required=False, default="local:ollama",
        help="Prefix for cell_id. Pattern: <node-tag>:<engine>. "
             "Examples: mac-m1:ollama, predator:llamacpp, vps:cpu.")
    p.add_argument("--phases", default=DEFAULT_PHASES,
        help=("Comma-separated subset of: hello, frozen, "
              + ", ".join(SUITES.keys())
              + ". Default: %(default)s. Pass 'all' for the full suite."))
    p.add_argument("--timeout", type=int, default=DEFAULT_TIMEOUT_S,
        help="Per-call wall-clock timeout (default %(default)s).")
    p.add_argument("--probe", action="store_true",
        help="Health-check only: list models, do one hello call, exit.")
    p.add_argument("--smoke", action="store_true",
        help="One model × hello-only. End-to-end runtime validation.")
    p.add_argument("--submitter-handle", required=False, default=None,
        help="Your Gitea (or any) handle. Used in submissions/<handle>/...")
    p.add_argument("--device-tag", required=False, default=None,
        help="Short device tag. Examples: mac-m1-8gb, rtx-4090-pc, "
             "predator-gtx-1060-6gb. Used in submissions/<handle>/<tag>/...")
    p.add_argument("--run-id", default=None,
        help="Override auto-generated run UUID (for resuming).")
    p.add_argument("--out-dir", default=None,
        help="Override the submissions/<handle>/<tag>/<run-id>/ output dir.")
    # The override flags default to None so that "not given" can be told
    # apart from "given with the canonical value".
    p.add_argument("--temperature", type=float, default=None,
        help=f"Override canonical temperature {CANONICAL_OPTIONS['temperature']}.")
    p.add_argument("--num-ctx", type=int, default=None,
        help=f"Override canonical num_ctx {CANONICAL_OPTIONS['num_ctx']}.")
    p.add_argument("--num-predict", type=int, default=None,
        help=f"Override canonical num_predict {CANONICAL_OPTIONS['num_predict']}.")
    return p.parse_args(argv)


def resolve_phases(phases_arg: str) -> list[str]:
    """Turn the `--phases` value into an ordered list of phase names.

    `all` (any case, surrounding whitespace ignored) expands to "hello",
    "frozen" and every key of `SUITES`. Otherwise the value is split on
    commas, blanks are dropped and the user's order is kept.

    Raises:
        SystemExit: if a name is neither "hello", "frozen" nor a `SUITES` key.
    """
    builtin = ("hello", "frozen")
    if phases_arg.strip().lower() == "all":
        return [*builtin, *SUITES]

    selected = []
    for piece in phases_arg.split(","):
        name = piece.strip()
        if not name:
            continue
        if name not in builtin and name not in SUITES:
            raise SystemExit(
                f"unknown phase {name!r}; valid: hello, frozen, "
                + ", ".join(SUITES.keys()) + ", or 'all'"
            )
        selected.append(name)
    return selected


def resolve_models(models_arg: str, target_url: str) -> list[str]:
    """Decide which models to run from `--models` and the target's listing.

    With `auto` the models reported by the target are returned. Otherwise
    the comma-separated names are returned as given; names the target does
    not report only trigger a warning.

    Raises:
        SystemExit: if `auto` was requested and the target reports no models.
    """
    on_target = list_models(target_url)
    log(f"  /v1/models reports {len(on_target)}: {on_target}")

    if models_arg != "auto":
        requested = [name.strip() for name in models_arg.split(",") if name.strip()]
        # An empty listing means "unknown", so nothing can be called missing.
        absent = [name for name in requested if name not in on_target] if on_target else []
        if absent:
            log(f"WARN: requested but not on target: {absent}. "
                "The runner will try anyway — your runtime may auto-pull.")
        return requested

    if on_target:
        return on_target
    raise SystemExit(
        "auto-list found no models on target; pass --models explicitly "
        "(e.g. --models qwen3.5:0.8b,qwen3.5:4b)."
    )


def write_manifest(out_dir: Path, args, run_id: str, phases: list[str], models: list[str], options: dict) -> None:
    """Write `manifest.json` for this run into *out_dir*.

    Args:
        out_dir: Existing directory that receives the file.
        args: Parsed CLI namespace (handle, device tag, URL, timeout and the
            three override flags are read from it).
        run_id: Identifier of this run.
        phases: Phases that will be run, in order.
        models: Models that will be run, in order.
        options: Effective options used for the calls.

    `canonical_options_overrides` lists every override given on the command
    line plus any key of *options* whose value differs from
    `CANONICAL_OPTIONS`, so overrides applied by other means are recorded as
    well. For options built only from the CLI flags the result is the same
    as listing the flags alone.
    """
    cli_overrides = {
        k: v for k, v in {
            "temperature": args.temperature,
            "num_ctx": args.num_ctx,
            "num_predict": args.num_predict,
        }.items() if v is not None
    }
    # Anything in the effective options that deviates from the canonical
    # values counts as an override, whether or not a flag set it.
    effective_overrides = {
        k: v for k, v in (options or {}).items()
        if k not in CANONICAL_OPTIONS or CANONICAL_OPTIONS[k] != v
    }
    manifest = {
        "schema_version": "manifest-1.0",
        "run_id": run_id,
        "harness_version": HARNESS_VERSION,
        "submitter_handle": args.submitter_handle,
        "device_tag": args.device_tag,
        "cell_id_prefix": args.cell_id_prefix,
        "target_url": args.target_url,
        "phases_run": phases,
        "models_run": models,
        "canonical_options": dict(CANONICAL_OPTIONS),
        # Effective values win if they ever disagree with the flags.
        "canonical_options_overrides": {**cli_overrides, **effective_overrides},
        "timeout_seconds": args.timeout,
        "started_at_utc": utc_now(),
        "host_hostname_short": socket.gethostname().split(".")[0],
        "platform_system": platform.system(),
        "platform_release": platform.release(),
        "python_version": platform.python_version(),
    }
    (out_dir / "manifest.json").write_text(
        json.dumps(manifest, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8",
    )


# Size tag inside a model name: a number followed by "b" (billions) or "m"
# (millions) that is not itself followed by another letter or digit.
_PARAM_SIZE_RE = re.compile(r"(\d+(?:\.\d+)?)([bm])(?![a-z0-9])", re.IGNORECASE)


def _approx_param_billions(model: str) -> float:
    """Best-effort parameter count, in billions, parsed from a model name.

    Uses the first size tag found ("0.8b" -> 0.8, "32b" -> 32.0,
    "270m" -> 0.27). Names without a recognisable tag rank as infinitely
    large so that they sort after every sized model.
    """
    found = _PARAM_SIZE_RE.search(model)
    if found is None:
        return float("inf")
    size = float(found.group(1))
    return size / 1000.0 if found.group(2).lower() == "m" else size


def main() -> int:
    """Command-line entry point: run the selected phases against each model.

    Modes:
        --probe: one hello call against the smallest listed model; nothing is
            written to disk.
        --smoke: the hello phase only, against the first model.
        default: the phases named by --phases against every model.

    Returns:
        0 on success, 1 when there is no model to run, 3 when the probe call
        reported an error, 130 after a KeyboardInterrupt.

    Raises:
        SystemExit: for an unknown phase, an empty `auto` model listing, or a
            missing --submitter-handle / --device-tag without --out-dir.
    """
    args = parse_args()

    target_url = args.target_url.rstrip("/")
    log(f"target_url = {target_url}")

    # Effective options = canonical values with any CLI override applied.
    options = dict(CANONICAL_OPTIONS)
    if args.temperature is not None:
        options["temperature"] = args.temperature
    if args.num_ctx is not None:
        options["num_ctx"] = args.num_ctx
    if args.num_predict is not None:
        options["num_predict"] = args.num_predict

    log("listing /v1/models on target …")
    models = resolve_models(args.models, target_url)

    if args.probe:
        if not models:
            log("no models available; abort")
            return 1
        # Pick the smallest model by the size tag in its name. Substring tests
        # such as `"2b" in name` also match "32b" or "72b", so the size is
        # parsed numerically; ties and untagged names keep listing order.
        smallest = min(models, key=_approx_param_billions)
        log(f"probe: hello call against smallest = {smallest}")
        rec = make_call_record(
            cell_id=f"{args.cell_id_prefix}:{smallest}",
            model=smallest, phase="probe",
            question_id="hello_check", prompt=HELLO_PROMPT, run_idx=0,
            required_markers=[], format_rule="",
            target_url=target_url, timeout=min(args.timeout, 60),
            canonical_options=options,
        )
        log(json.dumps(rec, indent=2, ensure_ascii=False))
        return 0 if not rec["error"] else 3

    if args.smoke:
        models = [models[0]] if models else []
        if not models:
            log("no models available for smoke; abort")
            return 1
        log(f"smoke: hello-only against {models[0]}")
        phases = ["hello"]
    else:
        phases = resolve_phases(args.phases)

    if not models:
        log("no models to run; abort")
        return 1

    # The default output location is derived from handle + device tag, so both
    # are mandatory unless an explicit --out-dir replaces it.
    if not args.submitter_handle and not args.out_dir:
        raise SystemExit(
            "--submitter-handle is required (or pass --out-dir for ad-hoc runs)."
        )
    if not args.device_tag and not args.out_dir:
        raise SystemExit(
            "--device-tag is required (or pass --out-dir for ad-hoc runs)."
        )

    run_id = args.run_id or str(uuid.uuid4())
    out_dir = (
        Path(args.out_dir) if args.out_dir
        else SUBMISSIONS_DIR / args.submitter_handle / args.device_tag / f"run-{run_id}"
    )
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / "run.jsonl"
    log(f"writing JSONL ledger to {out_path}")

    write_manifest(out_dir, args, run_id, phases, models, options)

    started_at = utc_now()
    started_load = load_avg()

    meta = {
        "type": "meta",
        "benchmark_run_id": run_id,
        "harness_version": HARNESS_VERSION,
        "started_at_utc": started_at,
        "host_hostname_short": socket.gethostname().split(".")[0],
        "load_avg_start": started_load,
        "target_url": target_url,
        "cell_id_prefix": args.cell_id_prefix,
        "submitter_handle": args.submitter_handle,
        "device_tag": args.device_tag,
        "execution_shape": "per-model-block",
        "phases_planned": phases,
        "models_planned": models,
        "canonical_options": dict(CANONICAL_OPTIONS),
        "canonical_options_effective": options,
        "timeout_seconds": args.timeout,
        "platform_system": platform.system(),
        "platform_release": platform.release(),
        "python_version": platform.python_version(),
    }

    # NOTE(review): mode "w" truncates an existing run.jsonl, so re-using
    # --run-id / --out-dir replaces the earlier ledger instead of resuming
    # it — confirm that this is the intended meaning of "resuming".
    with out_path.open("w", encoding="utf-8") as fh:
        write_jsonl(fh, meta)
        try:
            # Per-model blocks: all phases for one model before the next model.
            for model in models:
                log(f"=== model block: {model} ===")
                for phase in phases:
                    if phase == "hello":
                        run_hello(fh, model, target_url, args.timeout,
                                  args.cell_id_prefix, options)
                    elif phase == "frozen":
                        run_frozen_prompts(fh, model, target_url, args.timeout,
                                           args.cell_id_prefix, options)
                    else:
                        suite_path = SUITES_DIR / SUITES[phase]
                        if not suite_path.exists():
                            log(f"WARN: suite missing at {suite_path}; skip")
                            continue
                        run_suite(fh, model, target_url, args.timeout,
                                  suite_path, phase, args.cell_id_prefix, options)
        except KeyboardInterrupt:
            log("interrupted — partial ledger written")
            write_jsonl(fh, {
                "type": "interrupted",
                "ts_utc": utc_now(),
                "reason": "KeyboardInterrupt",
            })
            return 130
        finally:
            # The footer is written on every exit path, including interrupts
            # and unexpected exceptions.
            write_jsonl(fh, {
                "type": "footer",
                "ts_utc": utc_now(),
                "finished_at_utc": utc_now(),
                "load_avg_end": load_avg(),
            })

    log(f"done — ledger at {out_path}")
    log("next: agent fills hardware.json + run.md, then opens a PR.")
    return 0


# Script entry point: main()'s integer return value becomes the process exit
# status (raised as SystemExit).
if __name__ == "__main__":
    raise SystemExit(main())