import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
import { Type } from "typebox";
import { Text } from "@mariozechner/pi-tui";
import { Readability } from "@mozilla/readability";
import { parseHTML } from "linkedom";
import TurndownService from "turndown";

const USER_AGENT =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36";
const DEFAULT_TIMEOUT_MS = 30000;
const MAX_RESPONSE_SIZE = 5 * 1024 * 1024; // 5 MiB
const MAX_PDF_SIZE = 20 * 1024 * 1024; // 20 MiB
const MIN_USEFUL_CONTENT = 500;
const JINA_READER_BASE = "https://r.jina.ai/";
const JINA_TIMEOUT_MS = 30000;

/** Upper bound on the number of PDF pages whose text is extracted. */
const MAX_PDF_PAGES = 100;

const turndown = new TurndownService({
  headingStyle: "atx",
  codeBlockStyle: "fenced",
});

// ── Types ────────────────────────────────────────────────────────────

/** Outcome of fetching and converting a single URL. */
interface FetchResult {
  /** The URL the content was obtained from. */
  url: string;
  /** Human-readable title of the document. */
  title: string;
  /** Extracted content (Markdown for PDFs produced by `extractPDF`). */
  content: string;
  /** `null` on success (as returned by `extractPDF`); otherwise an error description. */
  error: string | null;
}

// ── PDF Extraction ───────────────────────────────────────────────────

/**
 * Decides whether a resource should be treated as a PDF.
 *
 * @param url - Resource URL; only its path component is inspected.
 * @param contentType - Optional `Content-Type` header value.
 * @returns `true` when the content type mentions `application/pdf` or the
 *   URL path ends in `.pdf` (both compared case-insensitively); `false`
 *   otherwise, including when `url` cannot be parsed.
 */
function isPDF(url: string, contentType?: string): boolean {
  // Media types are case-insensitive, so normalise before matching.
  if (contentType?.toLowerCase().includes("application/pdf")) return true;
  try {
    return new URL(url).pathname.toLowerCase().endsWith(".pdf");
  } catch {
    // Not a parseable absolute URL: nothing to inspect.
    return false;
  }
}

/**
 * Derives a readable fallback title from the last path segment of a PDF URL.
 *
 * Percent-escapes are decoded, a trailing `.pdf` (any letter case) is removed
 * and runs of `_`/`-` become single spaces.
 *
 * @param url - The PDF's URL.
 * @returns The derived title, or `"document"` when the URL is unparseable or
 *   yields an empty name.
 */
function pdfTitleFromUrl(url: string): string {
  const fallback = "document";
  try {
    const segments = new URL(url).pathname.split("/").filter(Boolean);
    const rawName = segments[segments.length - 1] ?? "";

    let name = rawName;
    try {
      name = decodeURIComponent(rawName);
    } catch {
      // Malformed percent-escape: keep the raw segment.
    }

    return (
      name
        .replace(/\.pdf$/i, "")
        .replace(/[_-]+/g, " ")
        .trim() || fallback
    );
  } catch {
    return fallback;
  }
}

/**
 * Extracts the text of a PDF and renders it as a Markdown document.
 *
 * The output starts with a `# title` heading and a block-quoted header
 * (source URL, page count, optional author), followed by the text of each
 * non-empty page separated by blank lines. At most `MAX_PDF_PAGES` pages are
 * read; when the document is longer a truncation notice is appended.
 *
 * @param buffer - Raw bytes of the PDF.
 * @param url - URL the PDF was fetched from; echoed in the result and used
 *   as the title fallback when the PDF metadata has no `Title`.
 * @returns A `FetchResult` with `error: null`.
 * @throws Propagates any error raised by the `unpdf` loader or the PDF parser.
 */
async function extractPDF(
  buffer: ArrayBuffer,
  url: string,
): Promise<FetchResult> {
  const { getDocumentProxy } = await import("unpdf");
  const pdf = await getDocumentProxy(new Uint8Array(buffer));

  try {
    const metadata = await pdf.getMetadata();
    const metadataInfo =
      metadata.info && typeof metadata.info === "object"
        ? (metadata.info as Record<string, unknown>)
        : null;
    const metaTitle =
      typeof metadataInfo?.Title === "string" ? metadataInfo.Title.trim() : "";
    const metaAuthor =
      typeof metadataInfo?.Author === "string"
        ? metadataInfo.Author.trim()
        : "";

    // Prefer the embedded title; fall back to a name derived from the URL.
    const title = metaTitle || pdfTitleFromUrl(url);

    const totalPages = pdf.numPages;
    const maxPages = Math.min(totalPages, MAX_PDF_PAGES);
    const truncated = totalPages > maxPages;

    // Pages are read one at a time, in order; pages with no text are skipped.
    const pages: string[] = [];
    for (let i = 1; i <= maxPages; i++) {
      const page = await pdf.getPage(i);
      const textContent = await page.getTextContent();
      const pageText = textContent.items
        // Not every content item carries a `str`; those contribute nothing.
        .map((item: unknown) => (item as { str?: string }).str ?? "")
        .join(" ")
        .replace(/\s+/g, " ")
        .trim();
      if (pageText) pages.push(pageText);
    }

    const lines: string[] = [
      `# ${title}`,
      "",
      `> Source: ${url}`,
      `> Pages: ${totalPages}${truncated ? ` (extracted first ${maxPages})` : ""}`,
    ];
    if (metaAuthor) lines.push(`> Author: ${metaAuthor}`);
    lines.push("", "---", "");
    lines.push(pages.join("\n\n"));

    if (truncated) {
      lines.push(
        "",
        "---",
        "",
        `*[Truncated: Only first ${maxPages} of ${totalPages} pages extracted]*`,
      );
    }

    return { url, title, content: lines.join("\n"), error: null };
  } finally {
    // Best-effort release of the parsed document; a cleanup failure must not
    // mask the extraction result or the original error.
    try {
      await pdf.destroy();
    } catch {
      /* ignore */
    }
  }
}

// NOTE(review): the definition on the next line is cut off in the reviewed source (it ends inside a regex literal); it is left exactly as found.
// ── RSC Content Extraction (Next.js) ───────────────────────────────── function extractRSCContent( html: string, ): { title: string; content: string } | null { if (!html.includes("self.__next_f.push")) return null; const chunkMap = new Map(); const scriptRegex = /