{
  "suite_name": "overnight-python-telemetry-v2-real-context",
  "version": "2.0",
  "purpose": "A deterministic overnight suite for evaluating big and small Ollama models on vps50 with harder multi-file prompts shaped after Slobodan's real implementation, review, debugging, and orchestration asks.",
  "models": [
    {
      "model": "qwen32-coder-32k",
      "display_name": "Qwen32 Coder 32k",
      "size_label": "32b"
    },
    {
      "model": "qwen14-coder-32k",
      "display_name": "Qwen14 Coder 32k",
      "size_label": "14b"
    },
    {
      "model": "codestral-32k",
      "display_name": "Codestral 32k",
      "size_label": "22b"
    },
    {
      "model": "codellama34-16k",
      "display_name": "CodeLlama 34 16k",
      "size_label": "34b"
    },
    {
      "model": "phind34-16k",
      "display_name": "Phind 34 16k",
      "size_label": "34b"
    },
    {
      "model": "qwen14-general-32k",
      "display_name": "Qwen14 General 32k",
      "size_label": "14b"
    },
    {
      "model": "qwen2.5-coder:3b",
      "display_name": "Qwen2.5 Coder 3B",
      "size_label": "3b"
    },
    {
      "model": "qwen2.5-coder:1.5b",
      "display_name": "Qwen2.5 Coder 1.5B",
      "size_label": "1.5b"
    },
    {
      "model": "qwen2.5:3b",
      "display_name": "Qwen2.5 3B",
      "size_label": "3b"
    },
    {
      "model": "llama3.2:3b",
      "display_name": "Llama 3.2 3B",
      "size_label": "3b"
    },
    {
      "model": "phi3",
      "display_name": "Phi-3 Mini",
      "size_label": "3.8b"
    }
  ],
  "questions": [
    {
      "id": "myboard_auth_redirect_triage",
      "title": "MyBoard Auth Redirect Triage",
      "category": "debugging",
      "format_rule": "json_dict",
      "num_predict": 700,
      "context_files": [
        "MyBoard/app/api.py",
        "MyBoard/web/nuxt-app/app/composables/useSession.ts",
        "MyBoard/web/nuxt-app/app/middleware/auth.global.ts",
        "MyBoard/web/nuxt-app/app/pages/login.vue",
        "MyBoard/tests/browser/flow-coverage-manifest.json",
        "MyBoard/docs/user flows/access-onboarding-and-account-flows.md"
      ],
      "prompt": "Return only JSON. You are debugging a real MyBoard issue where password login can succeed but the user still lands on the wrong route or loses tenant context.\nTask: produce a repo-grounded triage packet.\nContext files:\n1. MyBoard/app/api.py: exposes /auth/login and /auth/me.\n2. MyBoard/web/nuxt-app/app/composables/useSession.ts: loginWithPassword(), setSession(), fetchContext(), applyTenant(), applyProject().\n3. MyBoard/web/nuxt-app/app/middleware/auth.global.ts: redirects to /login, /tenant-missing, /403 and reads myboard:post-login-redirect.\n4. MyBoard/web/nuxt-app/app/pages/login.vue: saveRedirectTarget(), redirectAfterLogin(), resolveOidcErrorMessage().\n5. MyBoard/tests/browser/flow-coverage-manifest.json: login and tenant-missing flows are acceptance coverage.\n6. MyBoard/docs/user flows/access-onboarding-and-account-flows.md: expected post-login workspace behavior.\nOutput keys exactly: issue_summary, likely_root_causes, backend_touchpoints, frontend_touchpoints, tests_to_run, safe_fix_plan.\nConstraints: mention /auth/login, /auth/me, myboard:post-login-redirect, tenant-missing, X-Tenant, and do not invent files outside the context list.",
      "required_markers": [
        "/auth/login",
        "/auth/me",
        "useSession.ts",
        "auth.global.ts",
        "login.vue",
        "tenant-missing",
        "myboard:post-login-redirect",
        "X-Tenant"
      ]
    },
    {
      "id": "myboard_board_snapshot_regression_test",
      "title": "Board Snapshot Regression Test",
      "category": "tests",
      "format_rule": "pytest_code",
      "num_predict": 900,
      "context_files": [
        "MyBoard/app/api.py",
        "MyBoard/app/models.py",
        "MyBoard/tests/api/test_board_snapshot.py",
        "MyBoard/tests/api/test_task_bulk_jobs.py",
        "MyBoard/web/nuxt-app/app/composables/queries/boards.ts",
        "MyBoard/web/nuxt-app/app/pages/board/[[projectSlug]].vue"
      ],
      "prompt": "Return only Python code. Write one focused pytest module for a real MyBoard regression around /board/snapshot after bulk task assignment and lane movement.\nContext files:\n1. MyBoard/app/api.py: exposes /board/snapshot, /tasks/{task_id}, and bulk job endpoints.\n2. MyBoard/app/models.py: workflow statuses and lane ordering are defined here.\n3. MyBoard/tests/api/test_board_snapshot.py: current lane-count and project-scope coverage.\n4. MyBoard/tests/api/test_task_bulk_jobs.py: helper patterns for creating stories/tasks and checking snapshot sync after assign/move.\n5. MyBoard/web/nuxt-app/app/composables/queries/boards.ts: frontend expects board data to stay lane-consistent.\n6. MyBoard/web/nuxt-app/app/pages/board/[[projectSlug]].vue: board page consumes the snapshot.\nRequirements: include async auth helper, create at least one story and one task, move the task to session, verify /board/snapshot returns the task in the session lane, and assert assignee_ids survive the move. Do not invent endpoints outside the context list.",
      "required_markers": [
        "def test_",
        "/board/snapshot",
        "/tasks/",
        "session",
        "assignee_ids",
        "story_id",
        "api_client"
      ]
    },
    {
      "id": "myboard_lane_config_patch_plan",
      "title": "Lane Config Patch Plan",
      "category": "planning",
      "format_rule": "json_dict",
      "num_predict": 700,
      "context_files": [
        "MyBoard/app/models.py",
        "MyBoard/app/api.py",
        "MyBoard/tests/api/test_lane_config.py",
        "MyBoard/web/nuxt-app/app/composables/queries/lane-config.ts",
        "MyBoard/web/nuxt-app/app/lib/workflow.ts",
        "MyBoard/web/nuxt-app/app/pages/board/[[projectSlug]].vue"
      ],
      "prompt": "Return only JSON. A new regression report says project lane overrides can drift from the canonical workflow and confuse the board page.\nTask: prepare a concrete patch plan.\nContext files:\n1. MyBoard/app/models.py: default_lane_sequence() and workflow enums are canonical.\n2. MyBoard/app/api.py: organization and project lane-config endpoints live here.\n3. MyBoard/tests/api/test_lane_config.py: round-trip and inheritance tests already exist.\n4. MyBoard/web/nuxt-app/app/composables/queries/lane-config.ts: frontend query behavior.\n5. MyBoard/web/nuxt-app/app/lib/workflow.ts: frontend lane semantics.\n6. MyBoard/web/nuxt-app/app/pages/board/[[projectSlug]].vue: board rendering depends on effective lanes.\nOutput keys exactly: regression_summary, invariants_to_protect, backend_changes, frontend_changes, tests_to_add, rollout_checks.\nConstraints: mention default_lane_sequence, use_organization_default, effective_lanes, /organizations/{organization_id}/lane-config, /projects/{project_id}/lane-config, and do not invent new persistence layers.",
      "required_markers": [
        "default_lane_sequence",
        "use_organization_default",
        "effective_lanes",
        "/organizations/{organization_id}/lane-config",
        "/projects/{project_id}/lane-config",
        "test_lane_config.py"
      ]
    },
    {
      "id": "myboard_api_token_audit_regression_test",
      "title": "API Token Audit Regression Test",
      "category": "tests",
      "format_rule": "pytest_code",
      "num_predict": 900,
      "context_files": [
        "MyBoard/app/api.py",
        "MyBoard/app/models.py",
        "MyBoard/tests/api/test_api_tokens.py",
        "MyBoard/web/nuxt-app/app/composables/queries/api-tokens.ts",
        "MyBoard/web/nuxt-app/app/pages/settings/api-tokens.vue",
        "MyBoard/contracts/myboard-api.openapi.json"
      ],
      "prompt": "Return only Python code. Write one pytest module that hardens the API token lifecycle against an audit-ordering regression.\nContext files:\n1. MyBoard/app/api.py: /api-tokens endpoints, regenerate, revoke, and audits.\n2. MyBoard/app/models.py: APIToken, APITokenAudit, APITokenAction.\n3. MyBoard/tests/api/test_api_tokens.py: existing lifecycle coverage and auth header pattern.\n4. MyBoard/web/nuxt-app/app/composables/queries/api-tokens.ts: frontend sorts audits descending by created timestamp.\n5. MyBoard/web/nuxt-app/app/pages/settings/api-tokens.vue: UI expects regenerated and revoked tokens to refresh correctly.\n6. MyBoard/contracts/myboard-api.openapi.json: contract surface must stay aligned.\nRequirements: include create, machine-use, regenerate, revoke, audit fetch, and assertions that CREATED, REGENERATED, and REVOKED are all present in audit history and the revoked token is inactive. Do not use code fences.",
      "required_markers": [
        "def test_",
        "/api-tokens",
        "/auth/me",
        "APITokenAudit",
        "REGENERATED",
        "REVOKED",
        "CREATED"
      ]
    },
    {
      "id": "myboard_announcements_state_sync_review",
      "title": "Announcements State Sync Review",
      "category": "review",
      "format_rule": "json_dict",
      "num_predict": 700,
      "context_files": [
        "MyBoard/app/api.py",
        "MyBoard/app/models.py",
        "MyBoard/tests/api/test_announcements.py",
        "MyBoard/web/nuxt-app/app/composables/queries/announcements.ts",
        "MyBoard/web/nuxt-app/app/pages/announcements.vue",
        "MyBoard/docs/app-feature-inventory.md"
      ],
      "prompt": "Return only JSON. Review a suspected frontend/backend sync bug where announcement read and dismiss state can diverge after mark-all-read and list refresh.\nContext files:\n1. MyBoard/app/api.py: CRUD, read, unread, dismiss, undismiss, and mark-all-read endpoints.\n2. MyBoard/app/models.py: announcement read/dismiss persistence objects live here.\n3. MyBoard/tests/api/test_announcements.py: current API coverage.\n4. MyBoard/web/nuxt-app/app/composables/queries/announcements.ts: mergeAnnouncement(), mark-all-read cache behavior, include_dismissed handling.\n5. MyBoard/web/nuxt-app/app/pages/announcements.vue: UI depends on query cache correctness.\n6. MyBoard/docs/app-feature-inventory.md: announcements are a user-visible feature surface.\nOutput keys exactly: failure_modes, most_suspicious_cache_paths, backend_contract_checks, frontend_fix_options, regression_tests, rollout_risk.\nConstraints: mention mergeAnnouncement, include_dismissed, /announcements/mark-all-read, /announcements/{announcement_id}/dismiss, /announcements/{announcement_id}/read, and dismissed.",
      "required_markers": [
        "mergeAnnouncement",
        "include_dismissed",
        "/announcements/mark-all-read",
        "/announcements/{announcement_id}/dismiss",
        "/announcements/{announcement_id}/read",
        "dismissed"
      ]
    },
    {
      "id": "myboard_feature_flag_lifecycle_test",
      "title": "Feature Flag Lifecycle Test",
      "category": "tests",
      "format_rule": "pytest_code",
      "num_predict": 900,
      "context_files": [
        "MyBoard/app/api.py",
        "MyBoard/app/models.py",
        "MyBoard/tests/api/test_feature_flags.py",
        "MyBoard/web/nuxt-app/app/composables/queries/feature-flags.ts",
        "MyBoard/web/nuxt-app/app/pages/admin/index.vue",
        "MyBoard/contracts/myboard-api.openapi.json"
      ],
      "prompt": "Return only Python code. Write one pytest module for a real MyBoard feature-flag regression around environment toggles and history.\nContext files:\n1. MyBoard/app/api.py: feature-flag and feature-flag-environment endpoints.\n2. MyBoard/app/models.py: FeatureFlag, FeatureFlagEnvironment, FeatureFlagHistory, FeatureFlagState.\n3. MyBoard/tests/api/test_feature_flags.py: current lifecycle coverage.\n4. MyBoard/web/nuxt-app/app/composables/queries/feature-flags.ts: frontend expects detail and history cache to stay aligned.\n5. MyBoard/web/nuxt-app/app/pages/admin/index.vue: admin console consumes this data.\n6. MyBoard/contracts/myboard-api.openapi.json: response shapes must remain stable.\nRequirements: create two environments, create one flag, toggle dev to enabled with rollout percentage, fetch history, verify latest history action is toggle, verify non-admin toggle is rejected with 403, and verify delete cleanup. Do not invent helper libraries.",
      "required_markers": [
        "def test_",
        "/feature-flags",
        "/feature-flag-environments",
        "/history",
        "rollout_percentage",
        "403",
        "toggle"
      ]
    },
    {
      "id": "myboard_task_bulk_job_debug_packet",
      "title": "Task Bulk Job Debug Packet",
      "category": "debugging",
      "format_rule": "json_dict",
      "num_predict": 750,
      "context_files": [
        "MyBoard/app/api.py",
        "MyBoard/app/models.py",
        "MyBoard/tests/api/test_task_bulk_jobs.py",
        "MyBoard/web/nuxt-app/app/composables/queries/task-bulk.ts",
        "MyBoard/web/nuxt-app/app/composables/queries/boards.ts",
        "MyBoard/web/nuxt-app/app/components/ui/productivity/BulkActionToolbar.vue"
      ],
      "prompt": "Return only JSON. A production-like report says bulk assign jobs complete, but some task detail panels and board lanes stay stale until a hard refresh.\nTask: produce a debug packet grounded in the repo.\nContext files:\n1. MyBoard/app/api.py: /tasks/bulk/jobs, /tasks/bulk/preview, /tasks/{task_id}, /board/snapshot.\n2. MyBoard/app/models.py: TaskBulkJob, TaskBulkJobEntry, task assignment fields.\n3. MyBoard/tests/api/test_task_bulk_jobs.py: happy-path completion and board sync tests.\n4. MyBoard/web/nuxt-app/app/composables/queries/task-bulk.ts: invalidateBulkAffectedTaskCaches() and polling behavior.\n5. MyBoard/web/nuxt-app/app/composables/queries/boards.ts: board query cache consumers.\n6. MyBoard/web/nuxt-app/app/components/ui/productivity/BulkActionToolbar.vue: user trigger surface.\nOutput keys exactly: suspected_root_causes, cache_invalidation_gaps, backend_checks, frontend_checks, additional_tests, smallest_safe_fix.\nConstraints: mention invalidateBulkAffectedTaskCaches, queryKeys.bulkJobs.detail, queryKeys.boards.lanes, /tasks/bulk/jobs/{job_id}, /board/snapshot, and assignee_ids.",
      "required_markers": [
        "invalidateBulkAffectedTaskCaches",
        "queryKeys.bulkJobs.detail",
        "queryKeys.boards.lanes",
        "/tasks/bulk/jobs/{job_id}",
        "/board/snapshot",
        "assignee_ids"
      ]
    },
    {
      "id": "myboard_user_preferences_contract_test",
      "title": "User Preferences Contract Test",
      "category": "tests",
      "format_rule": "pytest_code",
      "num_predict": 950,
      "context_files": [
        "MyBoard/app/api.py",
        "MyBoard/app/models.py",
        "MyBoard/app/services/preferences.py",
        "MyBoard/tests/api/test_user_preferences.py",
        "MyBoard/web/nuxt-app/app/stores/preferences.ts",
        "MyBoard/web/nuxt-app/app/plugins/00-preferences-bootstrap.ts"
      ],
      "prompt": "Return only Python code. Write one pytest module that strengthens the user-preferences contract around nested payload updates and theme preview.\nContext files:\n1. MyBoard/app/api.py: /user/preferences and /user/preferences/theme-preview endpoints.\n2. MyBoard/app/models.py: ThemeMode, ThemePreset, BoardViewPreference, UserPreferences.\n3. MyBoard/app/services/preferences.py: normalization and validation live here.\n4. MyBoard/tests/api/test_user_preferences.py: existing nested payload coverage.\n5. MyBoard/web/nuxt-app/app/stores/preferences.ts: frontend consumes the persisted shape.\n6. MyBoard/web/nuxt-app/app/plugins/00-preferences-bootstrap.ts: bootstrap path depends on stable defaults.\nRequirements: include auth helper, one successful nested update assertion, one invalid timezone assertion, one theme-preview non-persistence assertion, and direct checks for locale, theme preset, and board default lane. Do not emit markdown fences.",
      "required_markers": [
        "def test_",
        "/user/preferences",
        "/user/preferences/theme-preview",
        "ThemePreset",
        "locale",
        "timezone",
        "default_lane"
      ]
    },
    {
      "id": "myboard_orchestration_timeline_forensics",
      "title": "Orchestration Timeline Forensics",
      "category": "forensics",
      "format_rule": "json_dict",
      "num_predict": 800,
      "context_files": [
        "MyBoard/app/api.py",
        "MyBoard/app/models.py",
        "MyBoard/tests/api/test_orchestration_events.py",
        "MyBoard/docs/user flows/orchestration-and-dependency-api-flows.md",
        "MyBoard/web/nuxt-app/app/pages/admin/index.vue",
        "MyBoard/web/nuxt-app/app/composables/queries/meta.ts"
      ],
      "prompt": "Return only JSON. You are investigating a real operator complaint: run history exists, but retry chains and handoff evidence are hard to explain from the admin surface.\nTask: produce a forensics packet.\nContext files:\n1. MyBoard/app/api.py: orchestration event, run, dependency, failure, and timeline endpoints.\n2. MyBoard/app/models.py: OrchestrationRun and related enums and evidence structures.\n3. MyBoard/tests/api/test_orchestration_events.py: canonical event ingestion, retry, and handoff timeline expectations.\n4. MyBoard/docs/user flows/orchestration-and-dependency-api-flows.md: user-visible operator flows.\n5. MyBoard/web/nuxt-app/app/pages/admin/index.vue: admin console surface.\n6. MyBoard/web/nuxt-app/app/composables/queries/meta.ts: operator metadata fetch patterns.\nOutput keys exactly: operator_problem_statement, timeline_questions_to_answer, endpoints_to_query, evidence_fields_that_matter, missing_tests, recommended_ui_improvements.\nConstraints: mention /orchestration/events, /orchestration/runs, /orchestration/dependencies, handoff_requested, run_failed, and retry chain.",
      "required_markers": [
        "/orchestration/events",
        "/orchestration/runs",
        "/orchestration/dependencies",
        "handoff_requested",
        "run_failed",
        "retry"
      ]
    },
    {
      "id": "truthgraph_ingest_log_triage",
      "title": "TruthGraph Ingest Log Triage",
      "category": "cross_repo_debugging",
      "format_rule": "json_dict",
      "num_predict": 800,
      "context_files": [
        "Earth/PHASE2_PROMPT_COMPLEXITY_METRIC_V1.md",
        "TruthGraph/docs/TRUTHGRAPH_DOC_INGESTION_CONTRACT.md",
        "TruthGraph/contracts/doc_ingest_manifest.schema.json",
        "TruthGraph/internal/query/resolve_context.go",
        "TruthGraph/internal/truthgraph/ingest/preflight/preflight.go",
        "TruthGraph/cmd/truthgraph/status.go"
      ],
      "prompt": "Return only JSON. This task mirrors Slobodan's real cross-repo debugging asks. Given a TruthGraph ingest run that discovers repositories but later produces stale or incomplete query answers, produce a triage packet.\nContext files:\n1. Earth/PHASE2_PROMPT_COMPLEXITY_METRIC_V1.md: prompts above threshold should be decomposed.\n2. TruthGraph/docs/TRUTHGRAPH_DOC_INGESTION_CONTRACT.md: intended doc-ingest behavior.\n3. TruthGraph/contracts/doc_ingest_manifest.schema.json: manifest contract surface.\n4. TruthGraph/internal/query/resolve_context.go: context resolution path.\n5. TruthGraph/internal/truthgraph/ingest/preflight/preflight.go: ingest preflight checks.\n6. TruthGraph/cmd/truthgraph/status.go: operator-visible status reporting.\nOutput keys exactly: observed_symptoms, likely_failure_surfaces, preflight_checks, status_gaps, code_paths_to_review, follow_up_commands.\nConstraints: mention doc_ingest_manifest.schema.json, resolve_context, preflight, status, stale index, and prompt complexity. Do not invent files outside the context list.",
      "required_markers": [
        "doc_ingest_manifest.schema.json",
        "resolve_context",
        "preflight",
        "status",
        "stale",
        "prompt complexity"
      ]
    }
  ]
}