Initial commit: Multi-service AI agent system

- Frontend: Vite + React + TypeScript chat interface
- Backend: FastAPI gateway with LangGraph routing
- Knowledge Service: ChromaDB RAG with Gitea scraper
- LangGraph Service: Multi-agent orchestration
- Airflow: Scheduled Gitea ingestion DAG
- Documentation: Complete plan and implementation guides

Architecture:
- Modular Docker Compose per service
- External ai-mesh network for communication
- Fast rebuilds with /app/packages pattern
- Intelligent agent routing (no hardcoded keywords)

Services:
- Frontend (5173): React chat UI
- Chat Gateway (8000): FastAPI entry point
- LangGraph (8090): Agent orchestration
- Knowledge (8080): ChromaDB RAG
- Airflow (8081): Scheduled ingestion
- PostgreSQL (5432): Chat history

Excludes: node_modules, .venv, chroma_db, logs, .env files
Includes: All source code, configs, docs, docker files
2026-02-27 19:51:06 +11:00
commit 628ba96998
44 changed files with 7177 additions and 0 deletions
--- a/airflow/dags/gitea_scraper.py
+++ b/airflow/dags/gitea_scraper.py
@@ -0,0 +1,121 @@
+import logging
+import os
+from dataclasses import dataclass
+from datetime import datetime
+from typing import List, Dict, Optional
+from urllib.parse import quote
+
+import httpx
+
+# Runs at import time: sets up root logging at INFO level. basicConfig does
+# nothing when the root logger already has handlers configured.
+logging.basicConfig(level=logging.INFO)
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+
+@dataclass
+class RepoMetadata:
+    """Summary of a single Gitea repository.
+
+    Attributes:
+        name: Repository name.
+        description: Free-text repository description.
+        url: Web URL of the repository.
+        default_branch: Name of the repository's default branch.
+        updated_at: Last-update timestamp, kept as the string it was
+            received as.
+        language: Primary language, or ``None`` when it is not known.
+    """
+
+    name: str
+    description: str
+    url: str
+    default_branch: str
+    updated_at: str
+    # Defaulting to None lets callers omit a value that is optional anyway.
+    language: Optional[str] = None
+
+class GiteaScraper:
+    """Read-only client for one user's repositories on a Gitea server.
+
+    All fetch methods are best-effort: request failures are logged and
+    reported as an empty result (``[]`` or ``""``) instead of being raised.
+    """
+
+    def __init__(self, base_url: str, token: str, username: str = "sam"):
+        """Store the connection settings and build the request headers.
+
+        Args:
+            base_url: Root URL of the Gitea instance; trailing slashes are
+                stripped.
+            token: API token, sent as ``Authorization: token <token>``.
+            username: Owner whose repositories are queried.
+        """
+        self.base_url = base_url.rstrip("/")
+        self.token = token
+        self.username = username
+        # With an empty token the header would carry "token " and no
+        # credential at all, so send no Authorization header instead.
+        self.headers = {"Authorization": f"token {token}"} if token else {}
+
+    def _api_url(self, *segments: str) -> str:
+        """Build an ``/api/v1`` URL, percent-encoding every path segment."""
+        # "/" stays unescaped so nested file paths keep their structure;
+        # characters such as "#", "?", "%" and spaces are escaped.
+        path = "/".join(quote(segment, safe="/") for segment in segments)
+        return f"{self.base_url}/api/v1/{path}"
+
+    def get_user_repos(self) -> List[RepoMetadata]:
+        """Fetch all repositories for the user.
+
+        Pages through the listing 50 entries at a time until an empty page
+        is returned.
+
+        Returns:
+            Metadata for every repository collected. If a request or a
+            response entry fails, the error is logged and the repositories
+            gathered so far are returned.
+        """
+        repos: List[RepoMetadata] = []
+        url = self._api_url("users", self.username, "repos")
+        page = 1
+
+        while True:
+            try:
+                response = httpx.get(
+                    url,
+                    params={"page": page, "limit": 50},
+                    headers=self.headers,
+                    timeout=30.0,
+                )
+                response.raise_for_status()
+
+                data = response.json()
+                if not data:
+                    break
+
+                for repo in data:
+                    repos.append(RepoMetadata(
+                        name=repo["name"],
+                        # A JSON null would otherwise leak through as None
+                        # even though the field is typed as str.
+                        description=repo.get("description") or "",
+                        url=repo["html_url"],
+                        default_branch=repo["default_branch"],
+                        updated_at=repo["updated_at"],
+                        language=repo.get("language"),
+                    ))
+
+                logger.info(f"Fetched page {page}, got {len(data)} repos")
+                page += 1
+
+            except Exception as e:
+                # Deliberately broad: stop paging and keep the partial result.
+                logger.error(f"Error fetching repos: {e}")
+                break
+
+        return repos
+
+    def get_readme(self, repo_name: str) -> str:
+        """Fetch README content for a repository.
+
+        Args:
+            repo_name: Name of a repository owned by ``self.username``.
+
+        Returns:
+            The text of the first README variant answered with HTTP 200, or
+            ``""`` when none of the candidates could be fetched.
+        """
+        # Try common README filenames
+        readme_names = ["README.md", "readme.md", "Readme.md", "README.rst"]
+
+        for readme_name in readme_names:
+            url = self._api_url(
+                "repos", self.username, repo_name, "raw", readme_name
+            )
+
+            try:
+                response = httpx.get(url, headers=self.headers, timeout=10.0)
+            except Exception as e:
+                logger.warning(f"Failed to fetch {readme_name}: {e}")
+                continue
+
+            if response.status_code == 200:
+                return response.text
+
+        return ""
+
+    def get_repo_files(self, repo_name: str, path: str = "") -> List[Dict]:
+        """List files in a repository directory.
+
+        Args:
+            repo_name: Name of a repository owned by ``self.username``.
+            path: Path inside the repository; ``""`` addresses the root.
+
+        Returns:
+            The entries reported by the contents endpoint, or ``[]`` when
+            the request fails.
+        """
+        url = self._api_url(
+            "repos", self.username, repo_name, "contents", path
+        )
+
+        try:
+            response = httpx.get(url, headers=self.headers, timeout=10.0)
+            response.raise_for_status()
+            payload = response.json()
+        except Exception as e:
+            logger.error(f"Error listing files in {repo_name}/{path}: {e}")
+            return []
+
+        # Per the Gitea API the contents endpoint answers with one object
+        # (not a list) when the path is a file; wrap it so callers always
+        # receive a list.
+        if isinstance(payload, dict):
+            return [payload]
+        return payload if isinstance(payload, list) else []
+
+    def get_file_content(self, repo_name: str, filepath: str) -> str:
+        """Fetch content of a specific file.
+
+        Args:
+            repo_name: Name of a repository owned by ``self.username``.
+            filepath: Path of the file inside the repository.
+
+        Returns:
+            The response text on HTTP 200, otherwise ``""``.
+        """
+        url = self._api_url(
+            "repos", self.username, repo_name, "raw", filepath
+        )
+
+        try:
+            response = httpx.get(url, headers=self.headers, timeout=10.0)
+        except Exception as e:
+            logger.error(f"Error fetching file {filepath}: {e}")
+            return ""
+
+        if response.status_code == 200:
+            return response.text
+
+        # Log the miss so an empty result can be told apart from an empty file.
+        logger.warning(
+            f"Unexpected status {response.status_code} fetching file {filepath}"
+        )
+        return ""
+
+# Manual smoke test: run this module directly to exercise the scraper.
+if __name__ == "__main__":
+    # Connection settings come from the environment, with fallbacks.
+    gitea_url = os.getenv("GITEA_URL", "https://gitea.lab.audasmedia.com.au")
+    gitea_token = os.getenv("GITEA_TOKEN", "")
+    gitea_user = os.getenv("GITEA_USERNAME", "sam")
+    client = GiteaScraper(
+        base_url=gitea_url,
+        token=gitea_token,
+        username=gitea_user,
+    )
+
+    found = client.get_user_repos()
+    print(f"Found {len(found)} repositories")
+
+    # Preview only the first three repositories.
+    for entry in found[:3]:
+        print(f"\nRepo: {entry.name}")
+        preview = client.get_readme(entry.name)
+        if not preview:
+            continue
+        print(f"README preview: {preview[:200]}...")
+