Initial commit: Multi-service AI agent system

- Frontend: Vite + React + TypeScript chat interface
- Backend: FastAPI gateway with LangGraph routing
- Knowledge Service: ChromaDB RAG with Gitea scraper
- LangGraph Service: Multi-agent orchestration
- Airflow: Scheduled Gitea ingestion DAG
- Documentation: Complete plan and implementation guides

Architecture:
- Modular Docker Compose per service
- External ai-mesh network for communication
- Fast rebuilds with /app/packages pattern
- Intelligent agent routing (no hardcoded keywords)

Services:
- Frontend (5173): React chat UI
- Chat Gateway (8000): FastAPI entry point
- LangGraph (8090): Agent orchestration
- Knowledge (8080): ChromaDB RAG
- Airflow (8081): Scheduled ingestion
- PostgreSQL (5432): Chat history

Excludes: node_modules, .venv, chroma_db, logs, .env files
Includes: All source code, configs, docs, docker files
This commit is contained in:
2026-02-27 19:51:06 +11:00
commit 628ba96998
44 changed files with 7177 additions and 0 deletions

View File

@@ -0,0 +1,29 @@
FROM python:3.11-slim

# System dependencies needed to build/run chromadb's native extensions.
# --no-install-recommends avoids pulling in unneeded packages; the apt lists
# are removed in the same layer so they never persist in the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    libstdc++6 \
    gcc \
    g++ \
    && rm -rf /var/lib/apt/lists/*

# Keep installed packages (/app/packages) separate from code (/app/code) so a
# dev volume mount over the code directory does not hide the dependencies.
RUN mkdir -p /app/packages /app/code

# Install Python packages to the dedicated target location.
# --no-cache-dir keeps pip's download cache out of the image layer.
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir --target=/app/packages -r requirements.txt

# Copy initial code (will be overridden by volume mount in dev)
COPY . /app/code/

# Make the separately-installed packages importable; unbuffered stdout so
# container logs appear immediately.
ENV PYTHONPATH=/app/packages
ENV PYTHONUNBUFFERED=1

WORKDIR /app/code
EXPOSE 8080
CMD ["python3", "-m", "uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080"]

View File

@@ -0,0 +1,15 @@
# Sam's Hobbies
## Music
- Enjoys playing guitar and synthesizers.
- Collects vintage vinyl.
## Gardening
- Maintains a local vegetable patch.
- Focuses on organic heirloom tomatoes.
## Skiing
- Advanced skier, prefers off-piste and backcountry in the Alps.
## Art
- Digital illustration and oil painting.

View File

@@ -0,0 +1,24 @@
services:
  knowledge-service:
    build: .
    image: sam/knowledge-service:latest
    container_name: knowledge-service
    ports:
      - "8080:8080"
    volumes:
      # Only mount the code directory, not packages
      # (the image installs dependencies to /app/packages; mounting over all
      # of /app/code would otherwise work, but these targeted mounts keep the
      # baked-in code as the fallback).
      - ./data:/app/code/data
      - ./chroma_db:/app/code/chroma_db
      - ./main.py:/app/code/main.py:ro # Read-only mount for safety
    environment:
      - PYTHONUNBUFFERED=1
      # Passed through from the host environment / .env file.
      - OPENROUTER_API_KEY=${OPENROUTER_API_KEY}
      - PYTHONPATH=/app/packages
    networks:
      - ai-mesh
    restart: unless-stopped

# Pre-existing shared network so sibling services (gateway, langgraph) can
# reach this container by name.
networks:
  ai-mesh:
    external: true

View File

@@ -0,0 +1,121 @@
import os
import httpx
import logging
from typing import List, Dict, Optional
from dataclasses import dataclass
from datetime import datetime

# Module-wide logging configuration: INFO level, module-named logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@dataclass
class RepoMetadata:
    """One Gitea repository's metadata, built from the
    /api/v1/users/{user}/repos response (see GiteaScraper.get_user_repos)."""
    name: str
    description: str  # may be "" when the repo has no description
    url: str  # the repo's html_url (web page), not the API URL
    default_branch: str
    updated_at: str  # timestamp string exactly as returned by the API
    language: Optional[str]  # primary language, or None if Gitea reports none
class GiteaScraper:
    """Read-only client for a Gitea instance's v1 REST API.

    Fetches repository metadata and raw file contents for a single user.
    All network errors are logged and swallowed so callers always receive a
    usable (possibly empty) result — this is a best-effort scraper.
    """

    def __init__(self, base_url: str, token: str, username: str = "sam"):
        self.base_url = base_url.rstrip("/")
        self.token = token
        self.username = username
        # Gitea uses the "token <value>" Authorization scheme.
        self.headers = {"Authorization": f"token {token}"}

    def get_user_repos(self) -> List[RepoMetadata]:
        """Fetch all repositories for the user, paging until an empty page.

        Returns whatever was collected so far if a request fails mid-way.
        """
        repos: List[RepoMetadata] = []
        page = 1
        while True:
            url = f"{self.base_url}/api/v1/users/{self.username}/repos?page={page}&limit=50"
            try:
                response = httpx.get(url, headers=self.headers, timeout=30.0)
                response.raise_for_status()
                data = response.json()
                if not data:
                    break  # past the last page
                for repo in data:
                    repos.append(RepoMetadata(
                        name=repo["name"],
                        # The API may return an explicit null description;
                        # `or ""` normalizes both missing and null to "" so
                        # the dataclass's `str` contract holds.
                        description=repo.get("description") or "",
                        url=repo["html_url"],
                        # Guard against the key being absent (e.g. empty repo).
                        default_branch=repo.get("default_branch", ""),
                        updated_at=repo["updated_at"],
                        language=repo.get("language")
                    ))
                logger.info(f"Fetched page {page}, got {len(data)} repos")
                page += 1
            except Exception as e:
                # Best-effort: log and return the partial result.
                logger.error(f"Error fetching repos: {e}")
                break
        return repos

    def get_readme(self, repo_name: str) -> str:
        """Fetch README content for a repository, or "" if none is found."""
        # Try common README filenames
        readme_names = ["README.md", "readme.md", "Readme.md", "README.rst"]
        for readme_name in readme_names:
            url = f"{self.base_url}/api/v1/repos/{self.username}/{repo_name}/raw/{readme_name}"
            try:
                response = httpx.get(url, headers=self.headers, timeout=10.0)
                if response.status_code == 200:
                    return response.text
            except Exception as e:
                logger.warning(f"Failed to fetch {readme_name}: {e}")
                continue
        return ""

    def get_repo_files(self, repo_name: str, path: str = "") -> List[Dict]:
        """List files in a repository directory; [] on any error."""
        url = f"{self.base_url}/api/v1/repos/{self.username}/{repo_name}/contents/{path}"
        try:
            response = httpx.get(url, headers=self.headers, timeout=10.0)
            response.raise_for_status()
            return response.json()
        except Exception as e:
            logger.error(f"Error listing files in {repo_name}/{path}: {e}")
            return []

    def get_file_content(self, repo_name: str, filepath: str) -> str:
        """Fetch raw content of a specific file; "" on error or non-200."""
        url = f"{self.base_url}/api/v1/repos/{self.username}/{repo_name}/raw/{filepath}"
        try:
            response = httpx.get(url, headers=self.headers, timeout=10.0)
            if response.status_code == 200:
                return response.text
        except Exception as e:
            logger.error(f"Error fetching file {filepath}: {e}")
        return ""
# Quick manual smoke test: run this module directly against a live Gitea.
if __name__ == "__main__":
    client = GiteaScraper(
        base_url=os.getenv("GITEA_URL", "https://gitea.lab.audasmedia.com.au"),
        token=os.getenv("GITEA_TOKEN", ""),
        username=os.getenv("GITEA_USERNAME", "sam"),
    )
    found = client.get_user_repos()
    print(f"Found {len(found)} repositories")
    # Spot-check README retrieval on the first three repositories only.
    for meta in found[:3]:
        print(f"\nRepo: {meta.name}")
        preview = client.get_readme(meta.name)
        if preview:
            print(f"README preview: {preview[:200]}...")

View File

@@ -0,0 +1,56 @@
# GOAL
Build a "Deep Knowledge Agent" (DKA) that acts as a secure,
quarantined bridge between the Chat Gateway and private data sources.
# ARCHITECTURE OVERVIEW
## Layers
1. Public Gateway: FastAPI (The "Voice").
2. Orchestration Layer: LangGraph Supervisor (The "Router").
3. Quarantined Agent: DKA / Librarian (The "Keeper of Secrets").
- Strictly Read-Only.
- Accesses ChromaDB and Media stores.
4. Specialist Agent: Opencode (The "Engineer").
## Data Sources (The "Knowledge Mesh")
- [ ] **Code**: Gitea (Repos, Markdown docs).
- [ ] **Notes**: Trilium Next, Obsidian, Flatnotes, HedgeDoc.
- [ ] **Wiki**: DokuWiki.
- [ ] **Inventory**: HomeBox (Physical gear, photos).
- [ ] **Tasks**: Vikunja.
- [ ] **Media**: Immich (Photos/Videos metadata via Gemini Vision).
## Agent Tooling & Orchestration
- [ ] **Orchestrators**: CAO CLI, Agent Pipe.
- [ ] **External Agents**: Goose, Aider, Opencode (Specialist).
# COMPONENT DETAILS
## The Librarian (DKA - LangGraph)
- Purpose: Semantic retrieval and data synthesis from vectors.
- Tools:
- `query_chroma`: Search the vector database.
- `fetch_media_link`: Returns a signed URL/path for Immich/HomeBox
images.
- Constraints:
- NO `bash` or `write` tools.
## The Ingestion Pipeline (Airflow/Custom Python)
- [ ] **Multi-Source Scrapers**: API-based (Gitea, Immich) and
File-based (Obsidian).
- [ ] **Vision Integration**: Gemini analyzes Immich photos to create
searchable text descriptions.
- [ ] **Storage**: ChromaDB (Vectors) + PostgreSQL (Metadata/Hashes).
# TODO LIST [0/4]
- [ ] Create 'knowledge_service' directory.
- [ ] Implement `test_rag.py` (Hello World retrieval).
- [ ] Build basic scraper for `hobbies.org`.
- [ ] Integrate DKA logic into the FastAPI Gateway.

View File

@@ -0,0 +1,47 @@
#+TITLE: Phase 3: Knowledge Engine & Agent Orchestration
#+AUTHOR: Giordano (via opencode)
#+OPTIONS: toc:2
* GOAL
Build a "Deep Knowledge Agent" (DKA) that acts as a secure, quarantined bridge between the Chat Gateway and private data sources.
* ARCHITECTURE OVERVIEW
** Layers
1. Public Gateway: FastAPI (The "Voice").
2. Orchestration Layer: LangGraph Supervisor (The "Router").
3. Quarantined Agent: DKA / Librarian (The "Keeper of Secrets").
- Strictly Read-Only.
- Accesses ChromaDB and Media stores.
4. Specialist Agent: Opencode (The "Engineer").
** Data Sources (The "Knowledge Mesh")
- [ ] *Code*: Gitea (Repos, Markdown docs).
- [ ] *Notes*: Trilium Next, Obsidian, Flatnotes, HedgeDoc.
- [ ] *Wiki*: DokuWiki.
- [ ] *Inventory*: HomeBox (Physical gear, photos).
- [ ] *Tasks*: Vikunja.
- [ ] *Media*: Immich (Photos/Videos metadata via Gemini Vision).
** Agent Tooling & Orchestration
- [ ] *Orchestrators*: CAO CLI, Agent Pipe.
- [ ] *External Agents*: Goose, Aider, Opencode (Specialist).
* COMPONENT DETAILS
** The Librarian (DKA - LangGraph)
- Purpose: Semantic retrieval and data synthesis from vectors.
- Tools:
- ~query_chroma~: Search the vector database.
- ~fetch_media_link~: Returns a signed URL/path for Immich/HomeBox images.
- Constraints:
- NO ~bash~ or ~write~ tools.
** The Ingestion Pipeline (Airflow/Custom Python)
- [ ] *Multi-Source Scrapers*: API-based (Gitea, Immich) and File-based (Obsidian).
- [ ] *Vision Integration*: Gemini analyzes Immich photos to create searchable text descriptions.
- [ ] *Storage*: ChromaDB (Vectors) + PostgreSQL (Metadata/Hashes).
* TODO LIST [0/4]
- [ ] Create 'knowledge_service' directory.
- [ ] Implement ~test_rag.py~ (Hello World retrieval).
- [ ] Build basic scraper for ~hobbies.org~.
- [ ] Integrate DKA logic into the FastAPI Gateway.

52
knowledge_service/main.py Normal file
View File

@@ -0,0 +1,52 @@
from fastapi import FastAPI
from pydantic import BaseModel
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
import logging
import sys

# Log to stdout so container logs capture everything.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)
logger = logging.getLogger(__name__)

app = FastAPI()

# Populated once at startup (see startup_event); None means "no data loaded".
vector_db = None

# OpenAI's text-embedding-3-small model, requested via the OpenRouter gateway.
# NOTE(review): a previous comment said "Voyage-2", which does not match the
# model name below; also confirm OpenRouter serves an embeddings endpoint for
# this base URL — TODO verify.
embeddings = OpenAIEmbeddings(
    model="openai/text-embedding-3-small",
    openai_api_base="https://openrouter.ai/api/v1",
    openai_api_key=os.getenv("OPENROUTER_API_KEY")
)
@app.on_event("startup")
async def startup_event():
    """Build the Chroma vector index from the hobbies file at service boot.

    Leaves the module-level `vector_db` as None when the data file is
    missing or indexing fails; /query then degrades to an empty context.
    """
    global vector_db
    data_path = "./data/hobbies.md"
    # Guard clause: nothing to index without the source file.
    if not os.path.exists(data_path):
        logger.warning(f"Librarian: Missing data file at {data_path}")
        return
    try:
        docs = TextLoader(data_path).load()
        splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        pieces = splitter.split_documents(docs)
        vector_db = Chroma.from_documents(
            documents=pieces,
            embedding=embeddings,
            persist_directory="./chroma_db",
        )
        logger.info("Librarian: ChromaDB is loaded with openAi embeddings.")
    except Exception as e:
        logger.error(f"Librarian: DB error: {str(e)}")
@app.get("/health")
async def health():
    """Liveness probe; also reports whether the vector index was built."""
    loaded = vector_db is not None
    return {"status": "ready", "vectors_loaded": loaded}
class QueryRequest(BaseModel):
    """Request body for POST /query."""
    # Natural-language question to search the vector store with.
    question: str
@app.post("/query")
async def query_knowledge(request: QueryRequest):
    """Return the two best-matching chunks joined by newlines.

    Responds with an empty context when no vector index is loaded.
    """
    if not vector_db:
        return {"context": ""}
    hits = vector_db.similarity_search(request.question, k=2)
    context = "\n".join(hit.page_content for hit in hits)
    return {"context": context}

View File

@@ -0,0 +1,7 @@
fastapi
uvicorn
httpx  # NOTE(review): imported by the Gitea scraper module; was missing here
langchain
langchain-community
langchain-openai
langchain-text-splitters
chromadb