// ── Types ────────────────────────────────────────────────────────────

/**
 * Shape returned by `extractPDF`: the source URL, a title, the extracted
 * content, and an error slot that is `null` on success.
 */
interface FetchResult {
  url: string;
  title: string;
  content: string;
  error: string | null;
}

// ── PDF Extraction ───────────────────────────────────────────────────

/**
 * Decides whether a response should be treated as a PDF.
 *
 * @param url - The URL that was requested.
 * @param contentType - The response's Content-Type value, if known.
 * @returns `true` when `contentType` mentions `application/pdf` (compared
 *   case-insensitively, because media types are case-insensitive) or when
 *   the URL's path ends in `.pdf` in any letter case. A URL that cannot be
 *   parsed yields `false`.
 */
function isPDF(url: string, contentType?: string): boolean {
  if (contentType?.toLowerCase().includes("application/pdf")) return true;
  try {
    return new URL(url).pathname.toLowerCase().endsWith(".pdf");
  } catch {
    return false;
  }
}

/** Returns `value` trimmed when it is a string, otherwise the empty string. */
function trimmedStringOrEmpty(value: unknown): string {
  return typeof value === "string" ? value.trim() : "";
}

/**
 * Builds a readable title from the last path segment of a PDF URL.
 *
 * Percent-escapes are decoded, a trailing `.pdf` is removed regardless of
 * case (matching what `isPDF` accepts), and runs of `_` / `-` become spaces.
 * Falls back to `"document"` when the URL is unparsable or nothing is left.
 */
async function titleFromPdfUrl(url: string): Promise<string> {
  const fallback = "document";
  try {
    // URL paths always use forward slashes, so use the POSIX flavour.
    const { posix } = await import("node:path");
    let segment = posix.basename(new URL(url).pathname);
    try {
      segment = decodeURIComponent(segment);
    } catch {
      // Malformed escape sequence: keep the raw segment rather than give up.
    }
    return (
      segment
        .replace(/\.pdf$/i, "")
        .replace(/[_-]+/g, " ")
        .trim() || fallback
    );
  } catch {
    return fallback;
  }
}

/**
 * Extracts the text of a PDF and renders it as a Markdown document.
 *
 * The output starts with a `# title` heading and a quoted header block
 * (source URL, page count, optional author), followed by the text of each
 * non-empty page separated by blank lines. Only the first 100 pages are
 * read; when the document is longer a truncation note is appended.
 *
 * @param buffer - Raw bytes of the PDF.
 * @param url - Where the PDF came from; used in the header and as the
 *   title fallback when the PDF metadata carries no Title.
 * @returns A `FetchResult` with `error` set to `null`.
 */
async function extractPDF(
  buffer: ArrayBuffer,
  url: string,
): Promise<FetchResult> {
  const pageLimit = 100;

  const { getDocumentProxy } = await import("unpdf");
  const pdf = await getDocumentProxy(new Uint8Array(buffer));
  // NOTE(review): the document proxy is never released here; if unpdf's
  // proxy exposes a destroy/cleanup method, consider calling it — confirm.

  const metadata = await pdf.getMetadata();
  const metadataInfo =
    metadata.info && typeof metadata.info === "object"
      ? (metadata.info as Record<string, unknown>)
      : null;

  const metaTitle = trimmedStringOrEmpty(metadataInfo?.Title);
  const metaAuthor = trimmedStringOrEmpty(metadataInfo?.Author);

  // Prefer the embedded title; otherwise derive one from the URL.
  const title = metaTitle || (await titleFromPdfUrl(url));

  const maxPages = Math.min(pdf.numPages, pageLimit);
  const pages: string[] = [];
  // Pages are 1-indexed.
  for (let i = 1; i <= maxPages; i++) {
    const page = await pdf.getPage(i);
    const textContent = await page.getTextContent();
    const pageText = textContent.items
      // Items without a `str` field contribute nothing.
      .map((item: unknown) => (item as { str?: string }).str || "")
      .join(" ")
      .replace(/\s+/g, " ")
      .trim();
    if (pageText) pages.push(pageText);
  }

  const truncated = pdf.numPages > maxPages;
  const lines: string[] = [
    `# ${title}`,
    "",
    `> Source: ${url}`,
    `> Pages: ${pdf.numPages}${truncated ? ` (extracted first ${maxPages})` : ""}`,
  ];
  if (metaAuthor) lines.push(`> Author: ${metaAuthor}`);
  lines.push("", "---", "");
  lines.push(pages.join("\n\n"));

  if (truncated) {
    lines.push(
      "",
      "---",
      "",
      `*[Truncated: Only first ${maxPages} of ${pdf.numPages} pages extracted]*`,
    );
  }

  return { url, title, content: lines.join("\n"), error: null };
}