Initial commit: Multi-service AI agent system
- Frontend: Vite + React + TypeScript chat interface - Backend: FastAPI gateway with LangGraph routing - Knowledge Service: ChromaDB RAG with Gitea scraper - LangGraph Service: Multi-agent orchestration - Airflow: Scheduled Gitea ingestion DAG - Documentation: Complete plan and implementation guides Architecture: - Modular Docker Compose per service - External ai-mesh network for communication - Fast rebuilds with /app/packages pattern - Intelligent agent routing (no hardcoded keywords) Services: - Frontend (5173): React chat UI - Chat Gateway (8000): FastAPI entry point - LangGraph (8090): Agent orchestration - Knowledge (8080): ChromaDB RAG - Airflow (8081): Scheduled ingestion - PostgreSQL (5432): Chat history Excludes: node_modules, .venv, chroma_db, logs, .env files Includes: All source code, configs, docs, docker files
This commit is contained in:
29
knowledge_service/Dockerfile
Normal file
29
knowledge_service/Dockerfile
Normal file
@@ -0,0 +1,29 @@
|
||||
# Knowledge-service image: dependencies are installed into /app/packages so
# the code directory can be volume-mounted in dev without clobbering them.
FROM python:3.11-slim

# Install system dependencies: gcc/g++ to build native wheels, libstdc++6 for
# chromadb's compiled components.  --no-install-recommends and the apt-list
# cleanup keep the layer small.
RUN apt-get update && apt-get install -y --no-install-recommends \
    libstdc++6 \
    gcc \
    g++ \
    && rm -rf /var/lib/apt/lists/*

# Create directories: /app/packages for pip targets, /app/code for sources
RUN mkdir -p /app/packages /app/code

# Install Python packages to a specific location
# (--no-cache-dir avoids baking pip's download cache into the image layer)
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir --target=/app/packages -r requirements.txt

# Copy initial code (will be overridden by volume mount in dev)
COPY . /app/code/

# Set Python to find packages in /app/packages
ENV PYTHONPATH=/app/packages
ENV PYTHONUNBUFFERED=1

WORKDIR /app/code
EXPOSE 8080

CMD ["python3", "-m", "uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080"]
|
||||
|
||||
15
knowledge_service/data/hobbies.md
Normal file
15
knowledge_service/data/hobbies.md
Normal file
@@ -0,0 +1,15 @@
|
||||
# Sam's Hobbies
|
||||
|
||||
## Music
|
||||
- Enjoys playing guitar and synthesizers.
|
||||
- Collects vintage vinyl.
|
||||
|
||||
## Gardening
|
||||
- Maintains a local vegetable patch.
|
||||
- Focuses on organic heirloom tomatoes.
|
||||
|
||||
## Skiing
|
||||
- Advanced skier, prefers off-piste and backcountry in the Alps.
|
||||
|
||||
## Art
|
||||
- Digital illustration and oil painting.
|
||||
24
knowledge_service/docker-compose.yml
Normal file
24
knowledge_service/docker-compose.yml
Normal file
@@ -0,0 +1,24 @@
|
||||
# Compose definition for the knowledge-service RAG API (exposed on 8080).
services:
  knowledge-service:
    build: .
    image: sam/knowledge-service:latest
    container_name: knowledge-service
    ports:
      - "8080:8080"
    volumes:
      # Only mount the code directory, not packages
      - ./data:/app/code/data
      - ./chroma_db:/app/code/chroma_db
      - ./main.py:/app/code/main.py:ro # Read-only mount for safety
    environment:
      - PYTHONUNBUFFERED=1
      # Forwarded from the host environment / .env file
      - OPENROUTER_API_KEY=${OPENROUTER_API_KEY}
      # Packages were installed here by the Dockerfile's pip --target
      - PYTHONPATH=/app/packages
    networks:
      - ai-mesh
    restart: unless-stopped

# Shared network created outside this file (docker network create ai-mesh)
networks:
  ai-mesh:
    external: true
|
||||
|
||||
121
knowledge_service/gitea_scraper.py
Normal file
121
knowledge_service/gitea_scraper.py
Normal file
@@ -0,0 +1,121 @@
|
||||
import os
|
||||
import httpx
|
||||
import logging
|
||||
from typing import List, Dict, Optional
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
|
||||
# Configure logging once at import so scraper progress and errors are visible.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
|
||||
|
||||
@dataclass
class RepoMetadata:
    """Snapshot of one Gitea repository's metadata, built from the repos API."""
    name: str            # repository name (used as a path segment in later API calls)
    description: str     # repo description from the API; may be empty
    url: str             # HTML URL of the repository in the Gitea web UI
    default_branch: str  # e.g. "main" or "master"
    updated_at: str      # last-updated timestamp string as returned by the API
    language: Optional[str]  # primary language reported by Gitea, or None
|
||||
|
||||
class GiteaScraper:
    """Read-only client for a Gitea instance's REST API (v1).

    Fetches repository metadata, READMEs, directory listings and raw file
    contents for a single user.  Network failures are logged and converted
    into empty results so ingestion callers never crash mid-run.
    """

    def __init__(self, base_url: str, token: str, username: str = "sam"):
        self.base_url = base_url.rstrip("/")
        self.token = token
        self.username = username
        # Gitea expects the "token <value>" authorization scheme.
        self.headers = {"Authorization": f"token {token}"}

    def get_user_repos(self) -> "List[RepoMetadata]":
        """Fetch all repositories for the user.

        Follows Gitea's page-based pagination (50 per page) until an empty
        page is returned; on a request error, returns whatever was collected
        so far.
        """
        repos = []
        page = 1

        while True:
            url = f"{self.base_url}/api/v1/users/{self.username}/repos?page={page}&limit=50"

            try:
                response = httpx.get(url, headers=self.headers, timeout=30.0)
                response.raise_for_status()

                data = response.json()
                if not data:
                    break

                for repo in data:
                    repos.append(RepoMetadata(
                        name=repo["name"],
                        # The API sends null (-> None) for repos with no
                        # description; normalise to "" to honour the str type.
                        description=repo.get("description") or "",
                        url=repo["html_url"],
                        default_branch=repo["default_branch"],
                        updated_at=repo["updated_at"],
                        language=repo.get("language")
                    ))

                logger.info(f"Fetched page {page}, got {len(data)} repos")
                page += 1

            except Exception as e:
                logger.error(f"Error fetching repos: {e}")
                break

        return repos

    def get_readme(self, repo_name: str) -> str:
        """Fetch README content for a repository.

        Tries common README filenames in order and returns the first hit;
        returns "" when no README exists or every request fails.
        """
        readme_names = ["README.md", "readme.md", "Readme.md", "README.rst"]

        for readme_name in readme_names:
            url = f"{self.base_url}/api/v1/repos/{self.username}/{repo_name}/raw/{readme_name}"

            try:
                response = httpx.get(url, headers=self.headers, timeout=10.0)
                if response.status_code == 200:
                    return response.text
            except Exception as e:
                logger.warning(f"Failed to fetch {readme_name}: {e}")
                continue

        return ""

    def get_repo_files(self, repo_name: str, path: str = "") -> List[Dict]:
        """List files in a repository directory.

        Returns the raw JSON entries from the contents endpoint, or [] on
        any error.
        """
        url = f"{self.base_url}/api/v1/repos/{self.username}/{repo_name}/contents/{path}"

        try:
            response = httpx.get(url, headers=self.headers, timeout=10.0)
            response.raise_for_status()
            return response.json()
        except Exception as e:
            logger.error(f"Error listing files in {repo_name}/{path}: {e}")
            return []

    def get_file_content(self, repo_name: str, filepath: str) -> str:
        """Fetch content of a specific file.

        Returns "" for any non-200 response or request failure.
        """
        url = f"{self.base_url}/api/v1/repos/{self.username}/{repo_name}/raw/{filepath}"

        try:
            response = httpx.get(url, headers=self.headers, timeout=10.0)
            if response.status_code == 200:
                return response.text
        except Exception as e:
            logger.error(f"Error fetching file {filepath}: {e}")

        return ""
|
||||
|
||||
# Manual smoke test: list repositories and preview READMEs of the first few.
if __name__ == "__main__":
    scraper = GiteaScraper(
        base_url=os.getenv("GITEA_URL", "https://gitea.lab.audasmedia.com.au"),
        token=os.getenv("GITEA_TOKEN", ""),
        username=os.getenv("GITEA_USERNAME", "sam"),
    )

    all_repos = scraper.get_user_repos()
    print(f"Found {len(all_repos)} repositories")

    # Only exercise the first three repos to keep the smoke test quick.
    for metadata in all_repos[:3]:
        print(f"\nRepo: {metadata.name}")
        readme_text = scraper.get_readme(metadata.name)
        if readme_text:
            print(f"README preview: {readme_text[:200]}...")
|
||||
|
||||
56
knowledge_service/knowledge_agent_plan.md
Normal file
56
knowledge_service/knowledge_agent_plan.md
Normal file
@@ -0,0 +1,56 @@
|
||||
# GOAL
|
||||
|
||||
Build a "Deep Knowledge Agent" (DKA) that acts as a secure,
quarantined bridge between the Chat Gateway and private data sources.
|
||||
|
||||
# ARCHITECTURE OVERVIEW
|
||||
|
||||
## Layers
|
||||
|
||||
1. Public Gateway: FastAPI (The "Voice").
2. Orchestration Layer: LangGraph Supervisor (The "Router").
3. Quarantined Agent: DKA / Librarian (The "Keeper of Secrets").
   - Strictly Read-Only.
   - Accesses ChromaDB and Media stores.
4. Specialist Agent: Opencode (The "Engineer").
|
||||
|
||||
## Data Sources (The "Knowledge Mesh")
|
||||
|
||||
- [ ] **Code**: Gitea (Repos, Markdown docs).
|
||||
- [ ] **Notes**: Trilium Next, Obsidian, Flatnotes, HedgeDoc.
|
||||
- [ ] **Wiki**: DokuWiki.
|
||||
- [ ] **Inventory**: HomeBox (Physical gear, photos).
|
||||
- [ ] **Tasks**: Vikunja.
|
||||
- [ ] **Media**: Immich (Photos/Videos metadata via Gemini Vision).
|
||||
|
||||
## Agent Tooling & Orchestration
|
||||
|
||||
- [ ] **Orchestrators**: CAO CLI, Agent Pipe.
|
||||
- [ ] **External Agents**: Goose, Aider, Opencode (Specialist).
|
||||
|
||||
# COMPONENT DETAILS
|
||||
|
||||
## The Librarian (DKA - LangGraph)
|
||||
|
||||
- Purpose: Semantic retrieval and data synthesis from vectors.
|
||||
- Tools:
|
||||
- `query_chroma`: Search the vector database.
|
||||
- `fetch_media_link`: Returns a signed URL/path for Immich/HomeBox
|
||||
images.
|
||||
- Constraints:
|
||||
- NO `bash` or `write` tools.
|
||||
|
||||
## The Ingestion Pipeline (Airflow/Custom Python)
|
||||
|
||||
- [ ] **Multi-Source Scrapers**: API-based (Gitea, Immich) and
|
||||
File-based (Obsidian).
|
||||
- [ ] **Vision Integration**: Gemini analyzes Immich photos to create
|
||||
searchable text descriptions.
|
||||
- [ ] **Storage**: ChromaDB (Vectors) + PostgreSQL (Metadata/Hashes).
|
||||
|
||||
# TODO LIST [0/4]
|
||||
|
||||
- [ ] Create 'knowledge_service' directory.
- [ ] Implement `test_rag.py` (Hello World retrieval).
- [ ] Build basic scraper for `hobbies.org`.
- [ ] Integrate DKA logic into the FastAPI Gateway.
|
||||
47
knowledge_service/knowledge_agent_plan.org
Normal file
47
knowledge_service/knowledge_agent_plan.org
Normal file
@@ -0,0 +1,47 @@
|
||||
#+TITLE: Phase 3: Knowledge Engine & Agent Orchestration
|
||||
#+AUTHOR: Giordano (via opencode)
|
||||
#+OPTIONS: toc:2
|
||||
|
||||
* GOAL
|
||||
Build a "Deep Knowledge Agent" (DKA) that acts as a secure, quarantined bridge between the Chat Gateway and private data sources.
|
||||
|
||||
* ARCHITECTURE OVERVIEW
|
||||
** Layers
|
||||
1. Public Gateway: FastAPI (The "Voice").
|
||||
2. Orchestration Layer: LangGraph Supervisor (The "Router").
|
||||
3. Quarantined Agent: DKA / Librarian (The "Keeper of Secrets").
|
||||
- Strictly Read-Only.
|
||||
- Accesses ChromaDB and Media stores.
|
||||
4. Specialist Agent: Opencode (The "Engineer").
|
||||
|
||||
** Data Sources (The "Knowledge Mesh")
|
||||
- [ ] *Code*: Gitea (Repos, Markdown docs).
|
||||
- [ ] *Notes*: Trilium Next, Obsidian, Flatnotes, HedgeDoc.
|
||||
- [ ] *Wiki*: DokuWiki.
|
||||
- [ ] *Inventory*: HomeBox (Physical gear, photos).
|
||||
- [ ] *Tasks*: Vikunja.
|
||||
- [ ] *Media*: Immich (Photos/Videos metadata via Gemini Vision).
|
||||
|
||||
** Agent Tooling & Orchestration
|
||||
- [ ] *Orchestrators*: CAO CLI, Agent Pipe.
|
||||
- [ ] *External Agents*: Goose, Aider, Opencode (Specialist).
|
||||
|
||||
* COMPONENT DETAILS
|
||||
** The Librarian (DKA - LangGraph)
|
||||
- Purpose: Semantic retrieval and data synthesis from vectors.
|
||||
- Tools:
|
||||
- ~query_chroma~: Search the vector database.
|
||||
- ~fetch_media_link~: Returns a signed URL/path for Immich/HomeBox images.
|
||||
- Constraints:
|
||||
- NO ~bash~ or ~write~ tools.
|
||||
|
||||
** The Ingestion Pipeline (Airflow/Custom Python)
|
||||
- [ ] *Multi-Source Scrapers*: API-based (Gitea, Immich) and File-based (Obsidian).
|
||||
- [ ] *Vision Integration*: Gemini analyzes Immich photos to create searchable text descriptions.
|
||||
- [ ] *Storage*: ChromaDB (Vectors) + PostgreSQL (Metadata/Hashes).
|
||||
|
||||
* TODO LIST [0/4]
|
||||
- [ ] Create 'knowledge_service' directory.
|
||||
- [ ] Implement ~test_rag.py~ (Hello World retrieval).
|
||||
- [ ] Build basic scraper for ~hobbies.org~.
|
||||
- [ ] Integrate DKA logic into the FastAPI Gateway.
|
||||
52
knowledge_service/main.py
Normal file
52
knowledge_service/main.py
Normal file
@@ -0,0 +1,52 @@
|
||||
from fastapi import FastAPI
|
||||
from pydantic import BaseModel
|
||||
from langchain_community.document_loaders import TextLoader
|
||||
from langchain_openai import OpenAIEmbeddings
|
||||
from langchain_community.vectorstores import Chroma
|
||||
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
||||
import os
|
||||
import logging
|
||||
import sys
|
||||
|
||||
logging.basicConfig(level=logging.INFO, stream=sys.stdout)
logger = logging.getLogger(__name__)

app = FastAPI()
# Populated by the startup hook; None until the data file has been embedded.
vector_db = None

# OpenAI text-embedding-3-small, served through the OpenRouter API gateway.
embeddings = OpenAIEmbeddings(
    model="openai/text-embedding-3-small",
    openai_api_base="https://openrouter.ai/api/v1",
    openai_api_key=os.getenv("OPENROUTER_API_KEY")
)
|
||||
|
||||
@app.on_event("startup")
async def startup_event():
    """Build the Chroma vector store from the local hobbies file at boot.

    Loads ./data/hobbies.md, splits it into overlapping chunks, embeds them
    and persists the store to ./chroma_db.  Failures are logged but never
    abort startup, so /query will simply serve empty context.
    """
    global vector_db
    data_path = "./data/hobbies.md"
    if os.path.exists(data_path):
        try:
            loader = TextLoader(data_path)
            documents = loader.load()
            # Split into 500-char chunks with 50-char overlap before embedding.
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
            chunks = text_splitter.split_documents(documents)
            vector_db = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory="./chroma_db")
            logger.info("Librarian: ChromaDB is loaded with openAi embeddings.")
        except Exception as e:
            logger.error(f"Librarian: DB error: {str(e)}")
    else:
        logger.warning(f"Librarian: Missing data file at {data_path}")
|
||||
|
||||
@app.get("/health")
async def health():
    """Liveness probe; also reports whether the vector store is initialised."""
    loaded = vector_db is not None
    return {"status": "ready", "vectors_loaded": loaded}
|
||||
|
||||
class QueryRequest(BaseModel):
    """Request body for POST /query."""
    # Natural-language question used for the similarity search.
    question: str
|
||||
|
||||
@app.post("/query")
async def query_knowledge(request: QueryRequest):
    """Return the two most similar chunks, joined by newlines, as context.

    Serves an empty context when the vector store has not been initialised.
    """
    if not vector_db:
        return {"context": ""}
    matches = vector_db.similarity_search(request.question, k=2)
    context_text = "\n".join(match.page_content for match in matches)
    return {"context": context_text}
|
||||
7
knowledge_service/requirements.txt
Normal file
7
knowledge_service/requirements.txt
Normal file
@@ -0,0 +1,7 @@
|
||||
fastapi
|
||||
uvicorn
|
||||
langchain
|
||||
langchain-community
|
||||
langchain-openai
|
||||
langchain-text-splitters
|
||||
chromadb
|
||||
Reference in New Issue
Block a user