{ "suite_name": "overnight-python-telemetry-v2-real-context", "version": "2.0", "purpose": "A deterministic overnight suite for evaluating big and small Ollama models on vps50 with harder multi-file prompts shaped after Slobodan's real implementation, review, debugging, and orchestration asks.", "models": [ { "model": "qwen32-coder-32k", "display_name": "Qwen32 Coder 32k", "size_label": "32b" }, { "model": "qwen14-coder-32k", "display_name": "Qwen14 Coder 32k", "size_label": "14b" }, { "model": "codestral-32k", "display_name": "Codestral 32k", "size_label": "22b" }, { "model": "codellama34-16k", "display_name": "CodeLlama 34 16k", "size_label": "34b" }, { "model": "phind34-16k", "display_name": "Phind 34 16k", "size_label": "34b" }, { "model": "qwen14-general-32k", "display_name": "Qwen14 General 32k", "size_label": "14b" }, { "model": "qwen2.5-coder:3b", "display_name": "Qwen2.5 Coder 3B", "size_label": "3b" }, { "model": "qwen2.5-coder:1.5b", "display_name": "Qwen2.5 Coder 1.5B", "size_label": "1.5b" }, { "model": "qwen2.5:3b", "display_name": "Qwen2.5 3B", "size_label": "3b" }, { "model": "llama3.2:3b", "display_name": "Llama 3.2 3B", "size_label": "3b" }, { "model": "phi3", "display_name": "Phi-3 Mini", "size_label": "3.8b" } ], "questions": [ { "id": "myboard_auth_redirect_triage", "title": "MyBoard Auth Redirect Triage", "category": "debugging", "format_rule": "json_dict", "num_predict": 700, "context_files": [ "MyBoard/app/api.py", "MyBoard/web/nuxt-app/app/composables/useSession.ts", "MyBoard/web/nuxt-app/app/middleware/auth.global.ts", "MyBoard/web/nuxt-app/app/pages/login.vue", "MyBoard/tests/browser/flow-coverage-manifest.json", "MyBoard/docs/user flows/access-onboarding-and-account-flows.md" ], "prompt": "Return only JSON. You are debugging a real MyBoard issue where password login can succeed but the user still lands on the wrong route or loses tenant context.\nTask: produce a repo-grounded triage packet.\nContext files:\n1. 
MyBoard/app/api.py: exposes /auth/login and /auth/me.\n2. MyBoard/web/nuxt-app/app/composables/useSession.ts: loginWithPassword(), setSession(), fetchContext(), applyTenant(), applyProject().\n3. MyBoard/web/nuxt-app/app/middleware/auth.global.ts: redirects to /login, /tenant-missing, /403 and reads myboard:post-login-redirect.\n4. MyBoard/web/nuxt-app/app/pages/login.vue: saveRedirectTarget(), redirectAfterLogin(), resolveOidcErrorMessage().\n5. MyBoard/tests/browser/flow-coverage-manifest.json: login and tenant-missing flows are acceptance coverage.\n6. MyBoard/docs/user flows/access-onboarding-and-account-flows.md: expected post-login workspace behavior.\nOutput keys exactly: issue_summary, likely_root_causes, backend_touchpoints, frontend_touchpoints, tests_to_run, safe_fix_plan.\nConstraints: mention /auth/login, /auth/me, myboard:post-login-redirect, tenant-missing, X-Tenant, and do not invent files outside the context list.", "required_markers": [ "/auth/login", "/auth/me", "useSession.ts", "auth.global.ts", "login.vue", "tenant-missing", "myboard:post-login-redirect", "X-Tenant" ] }, { "id": "myboard_board_snapshot_regression_test", "title": "Board Snapshot Regression Test", "category": "tests", "format_rule": "pytest_code", "num_predict": 900, "context_files": [ "MyBoard/app/api.py", "MyBoard/app/models.py", "MyBoard/tests/api/test_board_snapshot.py", "MyBoard/tests/api/test_task_bulk_jobs.py", "MyBoard/web/nuxt-app/app/composables/queries/boards.ts", "MyBoard/web/nuxt-app/app/pages/board/[[projectSlug]].vue" ], "prompt": "Return only Python code. Write one focused pytest module for a real MyBoard regression around /board/snapshot after bulk task assignment and lane movement.\nContext files:\n1. MyBoard/app/api.py: exposes /board/snapshot, /tasks/{task_id}, and bulk job endpoints.\n2. MyBoard/app/models.py: workflow statuses and lane ordering are defined here.\n3. MyBoard/tests/api/test_board_snapshot.py: current lane-count and project-scope coverage.\n4. 
MyBoard/tests/api/test_task_bulk_jobs.py: helper patterns for creating stories/tasks and checking snapshot sync after assign/move.\n5. MyBoard/web/nuxt-app/app/composables/queries/boards.ts: frontend expects board data to stay lane-consistent.\n6. MyBoard/web/nuxt-app/app/pages/board/[[projectSlug]].vue: board page consumes the snapshot.\nRequirements: include async auth helper, issue requests through api_client, create at least one story and one task that references it via story_id, move the task to the session lane, verify /board/snapshot returns the task in the session lane, and assert assignee_ids survive the move. Do not invent endpoints outside the context list.", "required_markers": [ "def test_", "/board/snapshot", "/tasks/", "session", "assignee_ids", "story_id", "api_client" ] }, { "id": "myboard_lane_config_patch_plan", "title": "Lane Config Patch Plan", "category": "planning", "format_rule": "json_dict", "num_predict": 700, "context_files": [ "MyBoard/app/models.py", "MyBoard/app/api.py", "MyBoard/tests/api/test_lane_config.py", "MyBoard/web/nuxt-app/app/composables/queries/lane-config.ts", "MyBoard/web/nuxt-app/app/lib/workflow.ts", "MyBoard/web/nuxt-app/app/pages/board/[[projectSlug]].vue" ], "prompt": "Return only JSON. A new regression report says project lane overrides can drift from the canonical workflow and confuse the board page.\nTask: prepare a concrete patch plan.\nContext files:\n1. MyBoard/app/models.py: default_lane_sequence() and workflow enums are canonical.\n2. MyBoard/app/api.py: organization and project lane-config endpoints live here.\n3. MyBoard/tests/api/test_lane_config.py: round-trip and inheritance tests already exist.\n4. MyBoard/web/nuxt-app/app/composables/queries/lane-config.ts: frontend query behavior.\n5. MyBoard/web/nuxt-app/app/lib/workflow.ts: frontend lane semantics.\n6. 
MyBoard/web/nuxt-app/app/pages/board/[[projectSlug]].vue: board rendering depends on effective lanes.\nOutput keys exactly: regression_summary, invariants_to_protect, backend_changes, frontend_changes, tests_to_add, rollout_checks.\nConstraints: mention default_lane_sequence, use_organization_default, effective_lanes, /organizations/{organization_id}/lane-config, /projects/{project_id}/lane-config, and do not invent new persistence layers.", "required_markers": [ "default_lane_sequence", "use_organization_default", "effective_lanes", "/organizations/{organization_id}/lane-config", "/projects/{project_id}/lane-config", "test_lane_config.py" ] }, { "id": "myboard_api_token_audit_regression_test", "title": "API Token Audit Regression Test", "category": "tests", "format_rule": "pytest_code", "num_predict": 900, "context_files": [ "MyBoard/app/api.py", "MyBoard/app/models.py", "MyBoard/tests/api/test_api_tokens.py", "MyBoard/web/nuxt-app/app/composables/queries/api-tokens.ts", "MyBoard/web/nuxt-app/app/pages/settings/api-tokens.vue", "MyBoard/contracts/myboard-api.openapi.json" ], "prompt": "Return only Python code. Write one pytest module that hardens the API token lifecycle against an audit-ordering regression.\nContext files:\n1. MyBoard/app/api.py: /api-tokens endpoints, regenerate, revoke, and audits.\n2. MyBoard/app/models.py: APIToken, APITokenAudit, APITokenAction.\n3. MyBoard/tests/api/test_api_tokens.py: existing lifecycle coverage and auth header pattern.\n4. MyBoard/web/nuxt-app/app/composables/queries/api-tokens.ts: frontend sorts audits descending by created timestamp.\n5. MyBoard/web/nuxt-app/app/pages/settings/api-tokens.vue: UI expects regenerated and revoked tokens to refresh correctly.\n6. 
MyBoard/contracts/myboard-api.openapi.json: contract surface must stay aligned.\nRequirements: include create, machine-use of the token against /auth/me, regenerate, revoke, audit fetch of the APITokenAudit history, and assertions that CREATED, REGENERATED, and REVOKED are all present in audit history and the revoked token is inactive. Do not use code fences.", "required_markers": [ "def test_", "/api-tokens", "/auth/me", "APITokenAudit", "REGENERATED", "REVOKED", "CREATED" ] }, { "id": "myboard_announcements_state_sync_review", "title": "Announcements State Sync Review", "category": "review", "format_rule": "json_dict", "num_predict": 700, "context_files": [ "MyBoard/app/api.py", "MyBoard/app/models.py", "MyBoard/tests/api/test_announcements.py", "MyBoard/web/nuxt-app/app/composables/queries/announcements.ts", "MyBoard/web/nuxt-app/app/pages/announcements.vue", "MyBoard/docs/app-feature-inventory.md" ], "prompt": "Return only JSON. Review a suspected frontend/backend sync bug where announcement read and dismiss state can diverge after mark-all-read and list refresh.\nContext files:\n1. MyBoard/app/api.py: CRUD, read, unread, dismiss, undismiss, and mark-all-read endpoints.\n2. MyBoard/app/models.py: announcement read/dismiss persistence objects live here.\n3. MyBoard/tests/api/test_announcements.py: current API coverage.\n4. MyBoard/web/nuxt-app/app/composables/queries/announcements.ts: mergeAnnouncement(), mark-all-read cache behavior, include_dismissed handling.\n5. MyBoard/web/nuxt-app/app/pages/announcements.vue: UI depends on query cache correctness.\n6. 
MyBoard/docs/app-feature-inventory.md: announcements are a user-visible feature surface.\nOutput keys exactly: failure_modes, most_suspicious_cache_paths, backend_contract_checks, frontend_fix_options, regression_tests, rollout_risk.\nConstraints: mention mergeAnnouncement, include_dismissed, /announcements/mark-all-read, /announcements/{announcement_id}/dismiss, /announcements/{announcement_id}/read, and dismissed.", "required_markers": [ "mergeAnnouncement", "include_dismissed", "/announcements/mark-all-read", "/announcements/{announcement_id}/dismiss", "/announcements/{announcement_id}/read", "dismissed" ] }, { "id": "myboard_feature_flag_lifecycle_test", "title": "Feature Flag Lifecycle Test", "category": "tests", "format_rule": "pytest_code", "num_predict": 900, "context_files": [ "MyBoard/app/api.py", "MyBoard/app/models.py", "MyBoard/tests/api/test_feature_flags.py", "MyBoard/web/nuxt-app/app/composables/queries/feature-flags.ts", "MyBoard/web/nuxt-app/app/pages/admin/index.vue", "MyBoard/contracts/myboard-api.openapi.json" ], "prompt": "Return only Python code. Write one pytest module for a real MyBoard feature-flag regression around environment toggles and history.\nContext files:\n1. MyBoard/app/api.py: /feature-flags and /feature-flag-environments endpoints, plus the flag /history endpoint.\n2. MyBoard/app/models.py: FeatureFlag, FeatureFlagEnvironment, FeatureFlagHistory, FeatureFlagState.\n3. MyBoard/tests/api/test_feature_flags.py: current lifecycle coverage.\n4. MyBoard/web/nuxt-app/app/composables/queries/feature-flags.ts: frontend expects detail and history cache to stay aligned.\n5. MyBoard/web/nuxt-app/app/pages/admin/index.vue: admin console consumes this data.\n6. MyBoard/contracts/myboard-api.openapi.json: response shapes must remain stable.\nRequirements: create two environments, create one flag, toggle dev to enabled with a rollout_percentage value, fetch history from the /history endpoint, verify latest history action is toggle, verify non-admin toggle is rejected with 403, and verify delete cleanup. 
Do not invent helper libraries.", "required_markers": [ "def test_", "/feature-flags", "/feature-flag-environments", "/history", "rollout_percentage", "403", "toggle" ] }, { "id": "myboard_task_bulk_job_debug_packet", "title": "Task Bulk Job Debug Packet", "category": "debugging", "format_rule": "json_dict", "num_predict": 750, "context_files": [ "MyBoard/app/api.py", "MyBoard/app/models.py", "MyBoard/tests/api/test_task_bulk_jobs.py", "MyBoard/web/nuxt-app/app/composables/queries/task-bulk.ts", "MyBoard/web/nuxt-app/app/composables/queries/boards.ts", "MyBoard/web/nuxt-app/app/components/ui/productivity/BulkActionToolbar.vue" ], "prompt": "Return only JSON. A production-like report says bulk assign jobs complete, but some task detail panels and board lanes stay stale until a hard refresh.\nTask: produce a debug packet grounded in the repo.\nContext files:\n1. MyBoard/app/api.py: /tasks/bulk/jobs, /tasks/bulk/preview, /tasks/{task_id}, /board/snapshot.\n2. MyBoard/app/models.py: TaskBulkJob, TaskBulkJobEntry, task assignment fields.\n3. MyBoard/tests/api/test_task_bulk_jobs.py: happy-path completion and board sync tests.\n4. MyBoard/web/nuxt-app/app/composables/queries/task-bulk.ts: invalidateBulkAffectedTaskCaches() and polling behavior.\n5. MyBoard/web/nuxt-app/app/composables/queries/boards.ts: board query cache consumers.\n6. 
MyBoard/web/nuxt-app/app/components/ui/productivity/BulkActionToolbar.vue: user trigger surface.\nOutput keys exactly: suspected_root_causes, cache_invalidation_gaps, backend_checks, frontend_checks, additional_tests, smallest_safe_fix.\nConstraints: mention invalidateBulkAffectedTaskCaches, queryKeys.bulkJobs.detail, queryKeys.boards.lanes, /tasks/bulk/jobs/{job_id}, /board/snapshot, and assignee_ids.", "required_markers": [ "invalidateBulkAffectedTaskCaches", "queryKeys.bulkJobs.detail", "queryKeys.boards.lanes", "/tasks/bulk/jobs/{job_id}", "/board/snapshot", "assignee_ids" ] }, { "id": "myboard_user_preferences_contract_test", "title": "User Preferences Contract Test", "category": "tests", "format_rule": "pytest_code", "num_predict": 950, "context_files": [ "MyBoard/app/api.py", "MyBoard/app/models.py", "MyBoard/app/services/preferences.py", "MyBoard/tests/api/test_user_preferences.py", "MyBoard/web/nuxt-app/app/stores/preferences.ts", "MyBoard/web/nuxt-app/app/plugins/00-preferences-bootstrap.ts" ], "prompt": "Return only Python code. Write one pytest module that strengthens the user-preferences contract around nested payload updates and theme preview.\nContext files:\n1. MyBoard/app/api.py: /user/preferences and /user/preferences/theme-preview endpoints.\n2. MyBoard/app/models.py: ThemeMode, ThemePreset, BoardViewPreference, UserPreferences.\n3. MyBoard/app/services/preferences.py: normalization and validation live here.\n4. MyBoard/tests/api/test_user_preferences.py: existing nested payload coverage.\n5. MyBoard/web/nuxt-app/app/stores/preferences.ts: frontend consumes the persisted shape.\n6. MyBoard/web/nuxt-app/app/plugins/00-preferences-bootstrap.ts: bootstrap path depends on stable defaults.\nRequirements: include auth helper, one successful nested update assertion, one invalid timezone assertion, one theme-preview non-persistence assertion, and direct checks for locale, theme preset, and board default lane. 
Do not emit markdown fences.", "required_markers": [ "def test_", "/user/preferences", "/user/preferences/theme-preview", "ThemePreset", "locale", "timezone", "default_lane" ] }, { "id": "myboard_orchestration_timeline_forensics", "title": "Orchestration Timeline Forensics", "category": "forensics", "format_rule": "json_dict", "num_predict": 800, "context_files": [ "MyBoard/app/api.py", "MyBoard/app/models.py", "MyBoard/tests/api/test_orchestration_events.py", "MyBoard/docs/user flows/orchestration-and-dependency-api-flows.md", "MyBoard/web/nuxt-app/app/pages/admin/index.vue", "MyBoard/web/nuxt-app/app/composables/queries/meta.ts" ], "prompt": "Return only JSON. You are investigating a real operator complaint: run history exists, but retry chains and handoff evidence are hard to explain from the admin surface.\nTask: produce a forensics packet.\nContext files:\n1. MyBoard/app/api.py: orchestration event, run, dependency, failure, and timeline endpoints.\n2. MyBoard/app/models.py: OrchestrationRun and related enums and evidence structures.\n3. MyBoard/tests/api/test_orchestration_events.py: canonical event ingestion, retry, and handoff timeline expectations.\n4. MyBoard/docs/user flows/orchestration-and-dependency-api-flows.md: user-visible operator flows.\n5. MyBoard/web/nuxt-app/app/pages/admin/index.vue: admin console surface.\n6. 
MyBoard/web/nuxt-app/app/composables/queries/meta.ts: operator metadata fetch patterns.\nOutput keys exactly: operator_problem_statement, timeline_questions_to_answer, endpoints_to_query, evidence_fields_that_matter, missing_tests, recommended_ui_improvements.\nConstraints: mention /orchestration/events, /orchestration/runs, /orchestration/dependencies, handoff_requested, run_failed, and retry chain.", "required_markers": [ "/orchestration/events", "/orchestration/runs", "/orchestration/dependencies", "handoff_requested", "run_failed", "retry" ] }, { "id": "truthgraph_ingest_log_triage", "title": "TruthGraph Ingest Log Triage", "category": "cross_repo_debugging", "format_rule": "json_dict", "num_predict": 800, "context_files": [ "Earth/PHASE2_PROMPT_COMPLEXITY_METRIC_V1.md", "TruthGraph/docs/TRUTHGRAPH_DOC_INGESTION_CONTRACT.md", "TruthGraph/contracts/doc_ingest_manifest.schema.json", "TruthGraph/internal/query/resolve_context.go", "TruthGraph/internal/truthgraph/ingest/preflight/preflight.go", "TruthGraph/cmd/truthgraph/status.go" ], "prompt": "Return only JSON. This task mirrors Slobodan's real cross-repo debugging asks. Given a TruthGraph ingest run that discovers repositories but later produces stale or incomplete query answers, produce a triage packet.\nContext files:\n1. Earth/PHASE2_PROMPT_COMPLEXITY_METRIC_V1.md: prompts above threshold should be decomposed.\n2. TruthGraph/docs/TRUTHGRAPH_DOC_INGESTION_CONTRACT.md: intended doc-ingest behavior.\n3. TruthGraph/contracts/doc_ingest_manifest.schema.json: manifest contract surface.\n4. TruthGraph/internal/query/resolve_context.go: context resolution path.\n5. TruthGraph/internal/truthgraph/ingest/preflight/preflight.go: ingest preflight checks.\n6. 
TruthGraph/cmd/truthgraph/status.go: operator-visible status reporting.\nOutput keys exactly: observed_symptoms, likely_failure_surfaces, preflight_checks, status_gaps, code_paths_to_review, follow_up_commands.\nConstraints: mention doc_ingest_manifest.schema.json, resolve_context, preflight, status, stale index, and prompt complexity. Do not invent files outside the context list.", "required_markers": [ "doc_ingest_manifest.schema.json", "resolve_context", "preflight", "status", "stale", "prompt complexity" ] } ] }