Initial commit: Multi-service AI agent system

- Frontend: Vite + React + TypeScript chat interface
- Backend: FastAPI gateway with LangGraph routing
- Knowledge Service: ChromaDB RAG with Gitea scraper
- LangGraph Service: Multi-agent orchestration
- Airflow: Scheduled Gitea ingestion DAG
- Documentation: Complete plan and implementation guides

Architecture:
- Modular Docker Compose per service
- External ai-mesh network for communication
- Fast rebuilds with /app/packages pattern
- Intelligent agent routing (no hardcoded keywords)

Services:
- Frontend (5173): React chat UI
- Chat Gateway (8000): FastAPI entry point
- LangGraph (8090): Agent orchestration
- Knowledge (8080): ChromaDB RAG
- Airflow (8081): Scheduled ingestion
- PostgreSQL (5432): Chat history

Excludes: node_modules, .venv, chroma_db, logs, .env files
Includes: All source code, configs, docs, docker files
This commit is contained in:
2026-02-27 19:51:06 +11:00
commit 628ba96998
44 changed files with 7177 additions and 0 deletions

View File

@@ -0,0 +1,29 @@
FROM python:3.11-slim

# System dependencies needed to build/run chromadb's native extensions.
# --no-install-recommends avoids pulling in unneeded packages; the apt lists
# are removed in the same layer so they never persist in the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    libstdc++6 \
    gcc \
    g++ \
    && rm -rf /var/lib/apt/lists/*

# Keep installed packages (/app/packages) separate from code (/app/code) so a
# dev volume mount over the code directory does not hide the dependencies.
RUN mkdir -p /app/packages /app/code

# Install Python packages to the dedicated target location.
# --no-cache-dir keeps pip's download cache out of the image layer.
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir --target=/app/packages -r requirements.txt

# Copy initial code (will be overridden by volume mount in dev)
COPY . /app/code/

# Make the separately-installed packages importable; unbuffered stdout so
# container logs appear immediately.
ENV PYTHONPATH=/app/packages
ENV PYTHONUNBUFFERED=1

WORKDIR /app/code
EXPOSE 8080
CMD ["python3", "-m", "uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080"]

View File

@@ -0,0 +1,15 @@
# Sam's Hobbies
## Music
- Enjoys playing guitar and synthesizers.
- Collects vintage vinyl.
## Gardening
- Maintains a local vegetable patch.
- Focuses on organic heirloom tomatoes.
## Skiing
- Advanced skier, prefers off-piste and backcountry in the Alps.
## Art
- Digital illustration and oil painting.

View File

@@ -0,0 +1,24 @@
services:
  knowledge-service:
    build: .
    image: sam/knowledge-service:latest
    container_name: knowledge-service
    ports:
      - "8080:8080"
    volumes:
      # Only mount the code directory, not packages
      # (the image installs dependencies to /app/packages; mounting over all
      # of /app/code would otherwise work, but these targeted mounts keep the
      # baked-in code as the fallback).
      - ./data:/app/code/data
      - ./chroma_db:/app/code/chroma_db
      - ./main.py:/app/code/main.py:ro # Read-only mount for safety
    environment:
      - PYTHONUNBUFFERED=1
      # Passed through from the host environment / .env file.
      - OPENROUTER_API_KEY=${OPENROUTER_API_KEY}
      - PYTHONPATH=/app/packages
    networks:
      - ai-mesh
    restart: unless-stopped

# Pre-existing shared network so sibling services (gateway, langgraph) can
# reach this container by name.
networks:
  ai-mesh:
    external: true

View File

@@ -0,0 +1,121 @@
import os
import httpx
import logging
from typing import List, Dict, Optional
from dataclasses import dataclass
from datetime import datetime

# Module-wide logging configuration: INFO level, module-named logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@dataclass
class RepoMetadata:
    """One Gitea repository's metadata, built from the
    /api/v1/users/{user}/repos response (see GiteaScraper.get_user_repos)."""
    name: str
    description: str  # may be "" when the repo has no description
    url: str  # the repo's html_url (web page), not the API URL
    default_branch: str
    updated_at: str  # timestamp string exactly as returned by the API
    language: Optional[str]  # primary language, or None if Gitea reports none
class GiteaScraper:
    """Read-only client for a Gitea instance's v1 REST API.

    Fetches repository metadata and raw file contents for a single user.
    All network errors are logged and swallowed so callers always receive a
    usable (possibly empty) result — this is a best-effort scraper.
    """

    def __init__(self, base_url: str, token: str, username: str = "sam"):
        self.base_url = base_url.rstrip("/")
        self.token = token
        self.username = username
        # Gitea uses the "token <value>" Authorization scheme.
        self.headers = {"Authorization": f"token {token}"}

    def get_user_repos(self) -> List[RepoMetadata]:
        """Fetch all repositories for the user, paging until an empty page.

        Returns whatever was collected so far if a request fails mid-way.
        """
        repos: List[RepoMetadata] = []
        page = 1
        while True:
            url = f"{self.base_url}/api/v1/users/{self.username}/repos?page={page}&limit=50"
            try:
                response = httpx.get(url, headers=self.headers, timeout=30.0)
                response.raise_for_status()
                data = response.json()
                if not data:
                    break  # past the last page
                for repo in data:
                    repos.append(RepoMetadata(
                        name=repo["name"],
                        # The API may return an explicit null description;
                        # `or ""` normalizes both missing and null to "" so
                        # the dataclass's `str` contract holds.
                        description=repo.get("description") or "",
                        url=repo["html_url"],
                        # Guard against the key being absent (e.g. empty repo).
                        default_branch=repo.get("default_branch", ""),
                        updated_at=repo["updated_at"],
                        language=repo.get("language")
                    ))
                logger.info(f"Fetched page {page}, got {len(data)} repos")
                page += 1
            except Exception as e:
                # Best-effort: log and return the partial result.
                logger.error(f"Error fetching repos: {e}")
                break
        return repos

    def get_readme(self, repo_name: str) -> str:
        """Fetch README content for a repository, or "" if none is found."""
        # Try common README filenames
        readme_names = ["README.md", "readme.md", "Readme.md", "README.rst"]
        for readme_name in readme_names:
            url = f"{self.base_url}/api/v1/repos/{self.username}/{repo_name}/raw/{readme_name}"
            try:
                response = httpx.get(url, headers=self.headers, timeout=10.0)
                if response.status_code == 200:
                    return response.text
            except Exception as e:
                logger.warning(f"Failed to fetch {readme_name}: {e}")
                continue
        return ""

    def get_repo_files(self, repo_name: str, path: str = "") -> List[Dict]:
        """List files in a repository directory; [] on any error."""
        url = f"{self.base_url}/api/v1/repos/{self.username}/{repo_name}/contents/{path}"
        try:
            response = httpx.get(url, headers=self.headers, timeout=10.0)
            response.raise_for_status()
            return response.json()
        except Exception as e:
            logger.error(f"Error listing files in {repo_name}/{path}: {e}")
            return []

    def get_file_content(self, repo_name: str, filepath: str) -> str:
        """Fetch raw content of a specific file; "" on error or non-200."""
        url = f"{self.base_url}/api/v1/repos/{self.username}/{repo_name}/raw/{filepath}"
        try:
            response = httpx.get(url, headers=self.headers, timeout=10.0)
            if response.status_code == 200:
                return response.text
        except Exception as e:
            logger.error(f"Error fetching file {filepath}: {e}")
        return ""
# Quick manual smoke test: run this module directly against a live Gitea.
if __name__ == "__main__":
    client = GiteaScraper(
        base_url=os.getenv("GITEA_URL", "https://gitea.lab.audasmedia.com.au"),
        token=os.getenv("GITEA_TOKEN", ""),
        username=os.getenv("GITEA_USERNAME", "sam"),
    )
    found = client.get_user_repos()
    print(f"Found {len(found)} repositories")
    # Spot-check README retrieval on the first three repositories only.
    for meta in found[:3]:
        print(f"\nRepo: {meta.name}")
        preview = client.get_readme(meta.name)
        if preview:
            print(f"README preview: {preview[:200]}...")

View File

@@ -0,0 +1,56 @@
# GOAL
Build a "Deep Knowledge Agent" (DKA) that acts as a secure,
quarantined bridge between the Chat Gateway and private data sources.
# ARCHITECTURE OVERVIEW
## Layers
1. Public Gateway: FastAPI (The "Voice").
2. Orchestration Layer: LangGraph Supervisor (The "Router").
3. Quarantined Agent: DKA / Librarian (The "Keeper of Secrets").
- Strictly Read-Only.
- Accesses ChromaDB and Media stores.
4. Specialist Agent: Opencode (The "Engineer").
## Data Sources (The "Knowledge Mesh")
- [ ] **Code**: Gitea (Repos, Markdown docs).
- [ ] **Notes**: Trilium Next, Obsidian, Flatnotes, HedgeDoc.
- [ ] **Wiki**: DokuWiki.
- [ ] **Inventory**: HomeBox (Physical gear, photos).
- [ ] **Tasks**: Vikunja.
- [ ] **Media**: Immich (Photos/Videos metadata via Gemini Vision).
## Agent Tooling & Orchestration
- [ ] **Orchestrators**: CAO CLI, Agent Pipe.
- [ ] **External Agents**: Goose, Aider, Opencode (Specialist).
# COMPONENT DETAILS
## The Librarian (DKA - LangGraph)
- Purpose: Semantic retrieval and data synthesis from vectors.
- Tools:
- `query_chroma`: Search the vector database.
- `fetch_media_link`: Returns a signed URL/path for Immich/HomeBox
images.
- Constraints:
- NO `bash` or `write` tools.
## The Ingestion Pipeline (Airflow/Custom Python)
- [ ] **Multi-Source Scrapers**: API-based (Gitea, Immich) and
File-based (Obsidian).
- [ ] **Vision Integration**: Gemini analyzes Immich photos to create
searchable text descriptions.
- [ ] **Storage**: ChromaDB (Vectors) + PostgreSQL (Metadata/Hashes).
# TODO LIST [0/4]
- [ ] Create 'knowledge_service' directory.
- [ ] Implement `test_rag.py` (Hello World retrieval).
- [ ] Build basic scraper for `hobbies.org`.
- [ ] Integrate DKA logic into the FastAPI Gateway.

View File

@@ -0,0 +1,47 @@
#+TITLE: Phase 3: Knowledge Engine & Agent Orchestration
#+AUTHOR: Giordano (via opencode)
#+OPTIONS: toc:2
* GOAL
Build a "Deep Knowledge Agent" (DKA) that acts as a secure, quarantined bridge between the Chat Gateway and private data sources.
* ARCHITECTURE OVERVIEW
** Layers
1. Public Gateway: FastAPI (The "Voice").
2. Orchestration Layer: LangGraph Supervisor (The "Router").
3. Quarantined Agent: DKA / Librarian (The "Keeper of Secrets").
- Strictly Read-Only.
- Accesses ChromaDB and Media stores.
4. Specialist Agent: Opencode (The "Engineer").
** Data Sources (The "Knowledge Mesh")
- [ ] *Code*: Gitea (Repos, Markdown docs).
- [ ] *Notes*: Trilium Next, Obsidian, Flatnotes, HedgeDoc.
- [ ] *Wiki*: DokuWiki.
- [ ] *Inventory*: HomeBox (Physical gear, photos).
- [ ] *Tasks*: Vikunja.
- [ ] *Media*: Immich (Photos/Videos metadata via Gemini Vision).
** Agent Tooling & Orchestration
- [ ] *Orchestrators*: CAO CLI, Agent Pipe.
- [ ] *External Agents*: Goose, Aider, Opencode (Specialist).
* COMPONENT DETAILS
** The Librarian (DKA - LangGraph)
- Purpose: Semantic retrieval and data synthesis from vectors.
- Tools:
- ~query_chroma~: Search the vector database.
- ~fetch_media_link~: Returns a signed URL/path for Immich/HomeBox images.
- Constraints:
- NO ~bash~ or ~write~ tools.
** The Ingestion Pipeline (Airflow/Custom Python)
- [ ] *Multi-Source Scrapers*: API-based (Gitea, Immich) and File-based (Obsidian).
- [ ] *Vision Integration*: Gemini analyzes Immich photos to create searchable text descriptions.
- [ ] *Storage*: ChromaDB (Vectors) + PostgreSQL (Metadata/Hashes).
* TODO LIST [0/4]
- [ ] Create 'knowledge_service' directory.
- [ ] Implement ~test_rag.py~ (Hello World retrieval).
- [ ] Build basic scraper for ~hobbies.org~.
- [ ] Integrate DKA logic into the FastAPI Gateway.

52
knowledge_service/main.py Normal file
View File

@@ -0,0 +1,52 @@
from fastapi import FastAPI
from pydantic import BaseModel
from langchain_community.document_loaders import TextLoader
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
import logging
import sys

# Log to stdout so container logs capture everything.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)
logger = logging.getLogger(__name__)

app = FastAPI()

# Populated once at startup (see startup_event); None means "no data loaded".
vector_db = None

# OpenAI's text-embedding-3-small model, requested via the OpenRouter gateway.
# NOTE(review): a previous comment said "Voyage-2", which does not match the
# model name below; also confirm OpenRouter serves an embeddings endpoint for
# this base URL — TODO verify.
embeddings = OpenAIEmbeddings(
    model="openai/text-embedding-3-small",
    openai_api_base="https://openrouter.ai/api/v1",
    openai_api_key=os.getenv("OPENROUTER_API_KEY")
)
@app.on_event("startup")
async def startup_event():
    """Build the Chroma vector index from the hobbies file at service boot.

    Leaves the module-level `vector_db` as None when the data file is
    missing or indexing fails; /query then degrades to an empty context.
    """
    global vector_db
    data_path = "./data/hobbies.md"
    # Guard clause: nothing to index without the source file.
    if not os.path.exists(data_path):
        logger.warning(f"Librarian: Missing data file at {data_path}")
        return
    try:
        docs = TextLoader(data_path).load()
        splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
        pieces = splitter.split_documents(docs)
        vector_db = Chroma.from_documents(
            documents=pieces,
            embedding=embeddings,
            persist_directory="./chroma_db",
        )
        logger.info("Librarian: ChromaDB is loaded with openAi embeddings.")
    except Exception as e:
        logger.error(f"Librarian: DB error: {str(e)}")
@app.get("/health")
async def health():
    """Liveness probe; also reports whether the vector index was built."""
    loaded = vector_db is not None
    return {"status": "ready", "vectors_loaded": loaded}
class QueryRequest(BaseModel):
    """Request body for POST /query."""
    # Natural-language question to search the vector store with.
    question: str
@app.post("/query")
async def query_knowledge(request: QueryRequest):
    """Return the two best-matching chunks joined by newlines.

    Responds with an empty context when no vector index is loaded.
    """
    if not vector_db:
        return {"context": ""}
    hits = vector_db.similarity_search(request.question, k=2)
    context = "\n".join(hit.page_content for hit in hits)
    return {"context": context}

View File

@@ -0,0 +1,7 @@
fastapi
uvicorn
httpx  # NOTE(review): imported by the Gitea scraper module; was missing here
langchain
langchain-community
langchain-openai
langchain-text-splitters
chromadb