// pi-config/extensions/smart-router/index.ts

import type {
  ExtensionAPI,
  ExtensionContext,
  BeforeAgentStartEvent,
  BeforeAgentStartEventResult,
  ExtensionCommandContext,
} from "@earendil-works/pi-coding-agent";
import { compress } from "headroom-ai";

// Manual model lock, shared by the /lock-model and /unlock-model commands and
// the before_agent_start handler. `lockedModel` is the registry entry picked
// by /lock-model and is null while no lock is active.
let lockedModel: any = null;
let isLocked = false;

// --- Headroom compression gating ---------------------------------------------

// Routing tag chosen for the turn in progress. The "context" handler reads it
// to decide whether compression applies; it is null until a prompt is routed.
let currentTag: string | null = null;

// Compression is skipped while the serialized messages are shorter than this
// many characters (about 5K tokens at a rough 4 characters per token).
const COMPRESS_MIN_CHARS = 20_000;

// Routing tags whose turns are eligible for Headroom context compression.
// The remaining tags (read / discuss / search) are never compressed.
const COMPRESS_TAGS = new Set<string>([
  "devops-low", "devops-high",
  "code-analysis-low", "code-analysis-high",
  "codewrite-low", "codewrite-high",
]);

// Routing key -> concrete model (provider plus registry id).
// Kept cheap on purpose: the free and flash models serve nearly every route,
// and the Pro model is reserved for the two high-complexity code-writing routes.
const MODELS: Record<string, { provider: string; id: string }> = {
  "free-core": { provider: "openrouter", id: "free" },
  "router-eval": { provider: "openrouter", id: "free" },
  "economy-devops": { provider: "opencode-go", id: "deepseek/deepseek-v4-flash" },
  "economy-code": { provider: "opencode-go", id: "deepseek/deepseek-v4-flash" },
  "precision-devops": { provider: "opencode-go", id: "deepseek/deepseek-v4-flash" },
  "precision-code-high": { provider: "opencode-go", id: "deepseek/deepseek-v4-pro" },
  "precision-react": { provider: "opencode-go", id: "deepseek/deepseek-v4-pro" },
  "context-heavy": { provider: "openrouter", id: "free" },
};

// Routing key -> thinking level. Only routes that truly benefit are listed;
// every other route is intended to run at the default (off) level for speed
// and cost.
const THINKING: Record<string, string> = {
  "precision-code-high": "high",
  "precision-react": "high",
};

/**
 * Resolves a routing key to a model through the context's model registry.
 *
 * @param ctx Extension context whose `modelRegistry` is queried.
 * @param key Routing key, i.e. a property name of `MODELS`.
 * @returns Whatever `modelRegistry.find` yields for the mapped provider/id, or
 *          `undefined` when the key has no `MODELS` entry.
 */
function getModel(ctx: ExtensionContext, key: string): any {
  const entry = MODELS[key];
  return entry ? ctx.modelRegistry.find(entry.provider, entry.id) : undefined;
}

/**
 * Short display name for a routing key, used in the status bar.
 *
 * @param key Routing key, i.e. a property name of `MODELS`.
 * @returns The text after the last "/" of the mapped model id (the whole id
 *          when it contains no "/"), e.g. "deepseek-v4-flash" or "free".
 *          Unknown keys are returned unchanged.
 */
function modelLabel(key: string): string {
  const entry = MODELS[key];
  if (!entry) return key;
  // lastIndexOf yields -1 when there is no "/", so slice(0) keeps the full id.
  return entry.id.slice(entry.id.lastIndexOf("/") + 1);
}

/**
 * Smart-router extension entry point.
 *
 * Registers on the given extension API:
 *  - `/lock-model <provider/model-id>`: pins a model and bypasses routing.
 *  - `/unlock-model`: turns dynamic routing back on.
 *  - a `before_agent_start` handler that asks a small classifier model (via
 *    OpenRouter) for a task tag, then switches model and thinking level.
 *  - a `context` handler that passes large message lists through the Headroom
 *    `compress` call when the routed tag is listed in `COMPRESS_TAGS`.
 *
 * Environment variables read:
 *  - `OPENROUTER_API_KEY`: bearer token for the classifier request. Without
 *    it the classifier is skipped and the "free-core" route is used.
 *  - `HEADROOM_BASE_URL`: optional override for the Headroom proxy address.
 *
 * @param pi Extension API handle supplied by the host.
 */
export default function (pi: ExtensionAPI) {
  // Endpoint used only for the small classification request.
  const ROUTER_URL = "https://openrouter.ai/api/v1/chat/completions";
  // Upper bound for the classification round trip, so a stalled connection
  // cannot hold up the agent start indefinitely.
  const ROUTER_TIMEOUT_MS = 10_000;
  // Level applied to every route that has no THINKING entry.
  const DEFAULT_THINKING_LEVEL = "off";
  // Headroom proxy address; the previous hard-coded value remains the default.
  const HEADROOM_BASE_URL = process.env["HEADROOM_BASE_URL"] || "http://192.168.20.13:8787";

  // Applies one routing decision: model, thinking level and status-bar text.
  const applyRoute = async (ctx: ExtensionContext, modelKey: string, status: string): Promise<void> => {
    const model = getModel(ctx, modelKey);
    // Awaited so the switch has completed before the handler returns; this is
    // a no-op wait if the host implements setModel synchronously.
    if (model) await pi.setModel(model);
    // Always set a level: otherwise a "high" left over from an earlier
    // precision route would stick to every later cheap route.
    // (Cast kept from the original code: THINKING stores plain strings.)
    pi.setThinkingLevel((THINKING[modelKey] ?? DEFAULT_THINKING_LEVEL) as any);
    ctx.ui.setStatus("router", status);
  };

  // Sends the prompt to the classifier and returns its trimmed text reply, or
  // "" when the HTTP status is not OK or the reply carries no text content.
  // Rejects on network failure, timeout or a non-JSON body.
  const classifyPrompt = async (prompt: string, apiKey: string): Promise<string> => {
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), ROUTER_TIMEOUT_MS);
    try {
      const response = await fetch(ROUTER_URL, {
        method: "POST",
        headers: {
          "Authorization": `Bearer ${apiKey}`,
          "Content-Type": "application/json",
        },
        body: JSON.stringify({
          model: "openrouter/owl-alpha",
          messages: [
            {
              role: "system",
              content:
                'You are a minimal command interceptor router. Analyze the prompt and output ONLY a minified JSON object containing the tag and language. No markdown, no conversation.\n\nValid Tags:\n- "read" (reading/ingesting logs, docs, read-only code)\n- "discuss" (general chat, architectural concepts)\n- "search" (requires web search/external lookups)\n- "devops-low" (editing yaml, single dockerfile, basic bash, env adjustments)\n- "devops-high" (complex multi-container network layers, server crashes)\n- "code-analysis-low" (finding a bug in a short file)\n- "code-analysis-high" (refactoring multiple complex files, tracking syntax errors)\n- "codewrite-low" (generating simple syntax boilerplate or standard functions)\n- "codewrite-high" (complex features, heavy React state, complex PHP architectures)\n\nRespond exactly like this: {"tag":"tag-name","lang":"react|php|python|rust|bash|none"}',
            },
            { role: "user", content: prompt },
          ],
          max_tokens: 35,
          temperature: 0,
        }),
        signal: controller.signal,
      });
      if (!response.ok) return "";
      const payload: any = await response.json();
      const raw: unknown = payload?.choices?.[0]?.message?.content;
      return typeof raw === "string" ? raw.trim() : "";
    } finally {
      clearTimeout(timer);
    }
  };

  // 1. Register /lock-model command
  pi.registerCommand("lock-model", {
    description: "Lock to a specific model. Disables dynamic routing until /unlock-model.",
    handler: async (args: string, ctx: ExtensionCommandContext) => {
      const modelId = (args || "").trim();
      if (!modelId) {
        ctx.ui.notify(
          "Usage: /lock-model <provider/model-id>\nExample: /lock-model openrouter/anthropic/claude-3.5-sonnet",
          "error"
        );
        return;
      }
      // Both sides of the first "/" must be non-empty ("/x" and "x/" are invalid).
      const slashIdx = modelId.indexOf("/");
      if (slashIdx <= 0 || slashIdx === modelId.length - 1) {
        ctx.ui.notify("Invalid model ID format. Use: provider/model-id", "error");
        return;
      }
      const provider = modelId.substring(0, slashIdx);
      const id = modelId.substring(slashIdx + 1);
      // OpenRouter model IDs include the provider prefix (e.g. "openrouter/owl-alpha"),
      // so retry the lookup with the full provider/model-id string.
      const model =
        ctx.modelRegistry.find(provider, id) ||
        ctx.modelRegistry.find(provider, `${provider}/${id}`);
      if (!model) {
        ctx.ui.notify(`Model not found: ${modelId}`, "error");
        return;
      }
      // NOTE(review): the host API is understood to resolve setModel to `false`
      // when the switch is refused (e.g. missing credentials) — confirm against
      // the ExtensionAPI docs. Any other result is treated as success.
      const switched: unknown = await pi.setModel(model);
      if (switched === false) {
        ctx.ui.notify(`Could not switch to model: ${modelId}`, "error");
        return;
      }
      isLocked = true;
      lockedModel = model;
      ctx.ui.notify(`🔒 Router disabled. Model locked to: ${modelId}`, "info");
    },
  });

  // 2. Register /unlock-model command
  pi.registerCommand("unlock-model", {
    description: "Re-enable dynamic prompt routing.",
    handler: async (_args: string, ctx: ExtensionCommandContext) => {
      isLocked = false;
      lockedModel = null;
      ctx.ui.notify("🔓 Router enabled. Prompts will be dynamically analyzed and routed.", "info");
    },
  });

  // 3. Intercept prompts before agent starts
  pi.on("before_agent_start", async (event: BeforeAgentStartEvent, ctx: ExtensionContext): Promise<BeforeAgentStartEventResult | undefined> => {
    // Forget the previous turn's tag up front. Otherwise a locked, short or
    // fallback turn would still be compressed based on an older decision.
    currentTag = null;

    // Pass-through if user locked the model manually
    if (isLocked && lockedModel) {
      await pi.setModel(lockedModel);
      ctx.ui.setStatus("router", `🔒 ${lockedModel.id || "unknown"}`);
      return;
    }

    const prompt = event.prompt;

    // Skip routing for short prompts / simple greetings
    if (prompt.trim().length < 15) {
      await applyRoute(ctx, "router-eval", `⚡ ${modelLabel("router-eval")}`);
      return;
    }

    // Without a key the classifier call cannot succeed; skip the round trip.
    const apiKey = process.env["OPENROUTER_API_KEY"] || "";
    if (!apiKey) {
      await applyRoute(ctx, "free-core", `⚠️ ${modelLabel("free-core")}`);
      return;
    }

    // Show analyzing indicator in status bar
    ctx.ui.setStatus("router", "🤔 Analyzing...");

    try {
      const content = await classifyPrompt(prompt, apiKey);
      if (!content) {
        await applyRoute(ctx, "free-core", `⚠️ ${modelLabel("free-core")}`);
        return;
      }

      // Tolerate code fences or stray text around the JSON object by parsing
      // only the outermost {...} span when one is present.
      const jsonStart = content.indexOf("{");
      const jsonEnd = content.lastIndexOf("}");
      const decision: any = JSON.parse(
        jsonStart >= 0 && jsonEnd > jsonStart ? content.slice(jsonStart, jsonEnd + 1) : content
      );
      const tag: string = typeof decision?.tag === "string" && decision.tag ? decision.tag : "read";
      const lang: string = typeof decision?.lang === "string" && decision.lang ? decision.lang : "none";

      // Deterministic routing tree
      let modelKey: string;
      switch (tag) {
        case "devops-low":
          modelKey = "economy-devops";
          break;
        case "devops-high":
          modelKey = "precision-devops";
          break;
        case "code-analysis-high":
          modelKey = "context-heavy";
          break;
        case "codewrite-low":
          modelKey = "economy-code";
          break;
        case "codewrite-high":
          modelKey = lang === "react" ? "precision-react" : "precision-code-high";
          break;
        // "read", "discuss", "search", "code-analysis-low" and unknown tags
        default:
          modelKey = "free-core";
      }

      // Status shows just the short model label to avoid truncation.
      await applyRoute(ctx, modelKey, `${modelLabel(modelKey)}`);

      // Store tag for the compression check in the context handler. Done last
      // so a failure above leaves compression off for this turn.
      currentTag = tag;
    } catch (_error) {
      // Network drop, timeout or unparsable reply: fall back gracefully.
      await applyRoute(ctx, "free-core", `⚠️ fallback ${modelLabel("free-core")}`);
    }
  });

  // 4. Compress large contexts before LLM turns (Headroom)
  pi.on("context", async (event, ctx) => {
    // Only compress for analysis/coding/devops tags
    if (!currentTag || !COMPRESS_TAGS.has(currentTag)) return;

    // Quick size check before calling the proxy
    const totalChars = JSON.stringify(event.messages).length;
    if (totalChars < COMPRESS_MIN_CHARS) return;

    try {
      const result = await compress(event.messages, {
        baseUrl: HEADROOM_BASE_URL,
        fallback: true,
        timeout: 15_000,
      });
      if (result.messages && result.messages.length > 0) {
        // Guard the ratio: a zero or missing tokensBefore would render "NaN%".
        const before = result.tokensBefore;
        const saved = before > 0
          ? ((before - result.tokensAfter) / before * 100).toFixed(0)
          : "0";
        ctx.ui.setStatus("compression", `📦${saved}%`);
        return { messages: result.messages };
      }
    } catch {
      // Proxy down — pass through (fallback: true already handles transport errors)
      ctx.ui.setStatus("compression", "⚠️");
    }
  });
}