import type {
  ExtensionAPI,
  ExtensionContext,
  BeforeAgentStartEvent,
  BeforeAgentStartEventResult,
  ExtensionCommandContext,
} from "@earendil-works/pi-coding-agent";
import { compress } from "headroom-ai";

// Global state for manual model lock.
// `lockedModel` holds the registry entry pinned by /lock-model; its type is
// derived from `ctx.modelRegistry.find` so it stays in sync with the host API.
let isLocked = false;
let lockedModel: NonNullable<ReturnType<ExtensionContext["modelRegistry"]["find"]>> | null = null;

// Tags that trigger Headroom context compression.
// read / discuss / search skip compression entirely.
const COMPRESS_TAGS: ReadonlySet<string> = new Set([
  "devops-low",
  "devops-high",
  "code-analysis-low",
  "code-analysis-high",
  "codewrite-low",
  "codewrite-high",
]);

// Minimum message size (in chars) before compression activates.
// ~5K tokens ≈ 20K characters (rough 4:1 ratio).
const COMPRESS_MIN_CHARS = 20_000;

// Current routing tag for the active turn (used by context handler)
let currentTag: string | null = null;

/** Provider / model-id pair that is looked up in the host's model registry. */
interface ModelRoute {
  provider: string;
  id: string;
}

// Model ID mappings for routing
// Cost-conscious: flash/economy for most tasks, Pro only for genuinely complex work
const MODELS: Record<string, ModelRoute> = {
  "free-core": { provider: "openrouter", id: "free" },
  "router-eval": { provider: "openrouter", id: "free" },
  "economy-devops": { provider: "opencode-go", id: "deepseek/deepseek-v4-flash" },
  "economy-code": { provider: "opencode-go", id: "deepseek/deepseek-v4-flash" },
  "precision-devops": { provider: "opencode-go", id: "deepseek/deepseek-v4-flash" },
  "precision-code-high": { provider: "opencode-go", id: "deepseek/deepseek-v4-pro" },
  "precision-react": { provider: "opencode-go", id: "deepseek/deepseek-v4-pro" },
  "context-heavy": { provider: "openrouter", id: "free" },
};

// Thinking level mappings — only set for tasks that truly benefit.
// Everything else keeps the default (off) for speed and cost.
const THINKING: Partial<Record<string, Parameters<ExtensionAPI["setThinkingLevel"]>[0]>> = {
  "precision-code-high": "high",
  "precision-react": "high",
};

/** OpenRouter chat-completions endpoint used for the one-shot classification call. */
const ROUTER_ENDPOINT = "https://openrouter.ai/api/v1/chat/completions";

/** Model that classifies each prompt into a routing tag. */
const ROUTER_MODEL = "openrouter/owl-alpha";

/** Upper bound on the classification request so a hung call cannot stall every prompt. */
const ROUTER_TIMEOUT_MS = 10_000;

/** Base URL of the Headroom compression proxy. */
const HEADROOM_BASE_URL = "http://192.168.20.13:8787";

/** System prompt sent to the classifier; the tag list must match `pickModelKey`. */
const ROUTER_SYSTEM_PROMPT = [
  "You are a minimal command interceptor router. Analyze the prompt and output ONLY a minified JSON object containing the tag and language. No markdown, no conversation.",
  "",
  "Valid Tags:",
  '- "read" (reading/ingesting logs, docs, read-only code)',
  '- "discuss" (general chat, architectural concepts)',
  '- "search" (requires web search/external lookups)',
  '- "devops-low" (editing yaml, single dockerfile, basic bash, env adjustments)',
  '- "devops-high" (complex multi-container network layers, server crashes)',
  '- "code-analysis-low" (finding a bug in a short file)',
  '- "code-analysis-high" (refactoring multiple complex files, tracking syntax errors)',
  '- "codewrite-low" (generating simple syntax boilerplate or standard functions)',
  '- "codewrite-high" (complex features, heavy React state, complex PHP architectures)',
  "",
  'Respond exactly like this: {"tag":"tag-name","lang":"react|php|python|rust|bash|none"}',
].join("\n");

/**
 * Resolves a routing key to a model from the host registry.
 *
 * @param ctx Extension context whose `modelRegistry` is queried.
 * @param key Key into `MODELS`.
 * @returns The registry entry, or `undefined` when the key is unknown (or the
 *          registry has no such model).
 */
function getModel(
  ctx: ExtensionContext,
  key: string,
): ReturnType<ExtensionContext["modelRegistry"]["find"]> | undefined {
  const mapping = MODELS[key];
  if (!mapping) return undefined;
  return ctx.modelRegistry.find(mapping.provider, mapping.id);
}

/**
 * Short display name for a routing key: the last `/`-separated segment of the
 * mapped model id (e.g. "deepseek-v4-flash", "free"). Unknown keys are returned as-is.
 */
function modelLabel(key: string): string {
  const mapping = MODELS[key];
  if (!mapping) return key;
  // lastIndexOf yields -1 for slash-free ids, so slice(0) keeps the whole id.
  return mapping.id.slice(mapping.id.lastIndexOf("/") + 1);
}

/**
 * Extracts `{ tag, lang }` from the classifier's reply.
 *
 * The reply is narrowed to the outermost `{ ... }` span first, so JSON wrapped
 * in code fences or surrounding chatter still parses. Missing or non-string
 * fields fall back to "read" / "none".
 *
 * @throws SyntaxError when the reply contains no parsable JSON object.
 */
function parseRouterDecision(content: string): { tag: string; lang: string } {
  const start = content.indexOf("{");
  const end = content.lastIndexOf("}");
  const candidate = start >= 0 && end > start ? content.slice(start, end + 1) : content;

  const parsed: unknown = JSON.parse(candidate);
  if (typeof parsed !== "object" || parsed === null) {
    throw new SyntaxError("Router reply is not a JSON object");
  }

  const { tag: rawTag, lang: rawLang } = parsed as { tag?: unknown; lang?: unknown };
  return {
    tag: typeof rawTag === "string" && rawTag !== "" ? rawTag : "read",
    lang: typeof rawLang === "string" && rawLang !== "" ? rawLang : "none",
  };
}

/** Deterministic routing tree: maps a classifier tag (and language) to a `MODELS` key. */
function pickModelKey(tag: string, lang: string): string {
  switch (tag) {
    case "devops-low":
      return "economy-devops";
    case "devops-high":
      return "precision-devops";
    case "code-analysis-high":
      return "context-heavy";
    case "codewrite-low":
      return "economy-code";
    case "codewrite-high":
      return lang === "react" ? "precision-react" : "precision-code-high";
    // "read", "discuss", "search", "code-analysis-low" and anything unrecognized.
    default:
      return "free-core";
  }
}

/** Whole-number percentage of tokens saved; "0" when the baseline is not a positive number. */
function formatSavedPercent(tokensBefore: number, tokensAfter: number): string {
  if (!(tokensBefore > 0)) return "0";
  return (((tokensBefore - tokensAfter) / tokensBefore) * 100).toFixed(0);
}

/** Switches to the "free-core" model and shows `<statusPrefix> <label>` in the router status slot. */
async function routeToFallback(pi: ExtensionAPI, ctx: ExtensionContext, statusPrefix: string): Promise<void> {
  const fallback = getModel(ctx, "free-core");
  if (fallback) await pi.setModel(fallback);
  ctx.ui.setStatus("router", `${statusPrefix} ${modelLabel("free-core")}`);
}

/**
 * Extension entry point. Registers:
 *  1. `/lock-model`   — pin a model and bypass routing,
 *  2. `/unlock-model` — resume routing,
 *  3. a `before_agent_start` hook that classifies the prompt and selects a model,
 *  4. a `context` hook that compresses large contexts through Headroom.
 */
export default function (pi: ExtensionAPI) {
  // 1. Register /lock-model command
  pi.registerCommand("lock-model", {
    description: "Lock to a specific model. Disables dynamic routing until /unlock-model.",
    handler: async (args: string, ctx: ExtensionCommandContext) => {
      const modelId = (args || "").trim();
      if (!modelId) {
        ctx.ui.notify(
          "Usage: /lock-model <provider/model-id>\nExample: /lock-model openrouter/anthropic/claude-3.5-sonnet",
          "error",
        );
        return;
      }

      // Both halves must be non-empty: "/foo" and "openrouter/" are rejected up front.
      const slashIdx = modelId.indexOf("/");
      if (slashIdx <= 0 || slashIdx === modelId.length - 1) {
        ctx.ui.notify("Invalid model ID format. Use: provider/model-id", "error");
        return;
      }

      const provider = modelId.substring(0, slashIdx);
      const id = modelId.substring(slashIdx + 1);

      // OpenRouter model IDs include provider prefix (e.g. "openrouter/owl-alpha"),
      // so retry with the full provider/model-id form when the bare id is unknown.
      const model = ctx.modelRegistry.find(provider, id) ?? ctx.modelRegistry.find(provider, `${provider}/${id}`);
      if (!model) {
        ctx.ui.notify(`Model not found: ${modelId}`, "error");
        return;
      }

      isLocked = true;
      lockedModel = model;
      // Awaited so a (possibly asynchronous) switch settles before success is reported.
      await pi.setModel(model);
      ctx.ui.notify(`🔒 Router disabled. Model locked to: ${modelId}`, "info");
    },
  });

  // 2. Register /unlock-model command
  pi.registerCommand("unlock-model", {
    description: "Re-enable dynamic prompt routing.",
    handler: async (_args: string, ctx: ExtensionCommandContext) => {
      isLocked = false;
      lockedModel = null;
      ctx.ui.notify("🔓 Router enabled. Prompts will be dynamically analyzed and routed.", "info");
    },
  });

  // 3. Intercept prompts before agent starts
  pi.on(
    "before_agent_start",
    async (event: BeforeAgentStartEvent, ctx: ExtensionContext): Promise<BeforeAgentStartEventResult | void> => {
      const prompt = event.prompt;

      // The tag describes the *active* turn only. Clear it first so the locked,
      // short-prompt and fallback paths never compress based on a previous turn's tag.
      currentTag = null;

      // Pass-through if user locked the model manually
      const pinned = lockedModel;
      if (isLocked && pinned) {
        await pi.setModel(pinned);
        ctx.ui.setStatus("router", `🔒 ${pinned.id || "unknown"}`);
        return;
      }

      // Skip routing for short prompts / simple greetings
      if (prompt.trim().length < 15) {
        const quick = getModel(ctx, "router-eval");
        if (quick) await pi.setModel(quick);
        ctx.ui.setStatus("router", `⚡ ${modelLabel("router-eval")}`);
        return;
      }

      // Show analyzing indicator in status bar
      ctx.ui.setStatus("router", "🤔 Analyzing...");

      try {
        const apiKey = process.env["OPENROUTER_API_KEY"] || "";
        const response = await fetch(ROUTER_ENDPOINT, {
          method: "POST",
          headers: {
            "Authorization": `Bearer ${apiKey}`,
            "Content-Type": "application/json",
          },
          body: JSON.stringify({
            model: ROUTER_MODEL,
            messages: [
              { role: "system", content: ROUTER_SYSTEM_PROMPT },
              { role: "user", content: prompt },
            ],
            max_tokens: 35,
            temperature: 0,
          }),
          // A timeout rejects the fetch, which lands in the catch-all fallback below.
          signal: AbortSignal.timeout(ROUTER_TIMEOUT_MS),
        });

        const payload = await response.json();
        const content = payload?.choices?.[0]?.message?.content?.trim();
        if (!content) {
          // Error payloads (bad key, rate limit, ...) carry no choices.
          await routeToFallback(pi, ctx, "⚠️");
          return;
        }

        const { tag, lang } = parseRouterDecision(content);
        const modelKey = pickModelKey(tag, lang);

        const model = getModel(ctx, modelKey);
        if (model) await pi.setModel(model);

        // Set thinking level based on task complexity.
        // NOTE(review): the level is only ever raised here; a later turn routed to a
        // key without a THINKING entry keeps whatever level is already active. Confirm
        // whether it should be reset explicitly.
        const thinkingLevel = THINKING[modelKey];
        if (thinkingLevel) pi.setThinkingLevel(thinkingLevel);

        // Store tag for compression check in context event
        currentTag = tag;

        // Show routing decision in status bar (shortened to avoid truncation)
        ctx.ui.setStatus("router", `${modelLabel(modelKey)}`);
      } catch (_error) {
        // Fallback gracefully on network drops, timeouts and unparsable replies
        await routeToFallback(pi, ctx, "⚠️ fallback");
      }
    },
  );

  // 4. Compress large contexts before LLM turns (Headroom)
  pi.on("context", async (event, ctx) => {
    // Only compress for analysis/coding/devops tags
    if (!currentTag || !COMPRESS_TAGS.has(currentTag)) return undefined;

    // Quick size check before calling the proxy
    const totalChars = JSON.stringify(event.messages).length;
    if (totalChars < COMPRESS_MIN_CHARS) return undefined;

    try {
      const result = await compress(event.messages, {
        baseUrl: HEADROOM_BASE_URL,
        fallback: true,
        timeout: 15_000,
      });

      if (result.messages && result.messages.length > 0) {
        const saved = formatSavedPercent(result.tokensBefore, result.tokensAfter);
        ctx.ui.setStatus("compression", `📦${saved}%`);
        return { messages: result.messages };
      }
    } catch {
      // Proxy down — pass through (fallback: true already handles transport errors)
      ctx.ui.setStatus("compression", "⚠️");
    }

    // No replacement: the host keeps the original messages.
    return undefined;
  });
}