Initial commit: Multi-service AI agent system

- Frontend: Vite + React + TypeScript chat interface
- Backend: FastAPI gateway with LangGraph routing
- Knowledge Service: ChromaDB RAG with Gitea scraper
- LangGraph Service: Multi-agent orchestration
- Airflow: Scheduled Gitea ingestion DAG
- Documentation: Complete plan and implementation guides

Architecture:
- Modular Docker Compose per service
- External ai-mesh network for communication
- Fast rebuilds with /app/packages pattern
- Intelligent agent routing (no hardcoded keywords; see the routing sketch below)

Services:
- Frontend (5173): React chat UI
- Chat Gateway (8000): FastAPI entry point
- LangGraph (8090): Agent orchestration
- Knowledge (8080): ChromaDB RAG
- Airflow (8081): Scheduled ingestion
- PostgreSQL (5432): Chat history

Excludes: node_modules, .venv, chroma_db, logs, .env files
Includes: All source code, configs, docs, docker files
2026-02-27 19:51:06 +11:00
commit 628ba96998
44 changed files with 7177 additions and 0 deletions
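
The "intelligent agent routing" noted in the commit message lives in the LangGraph service, whose source is not among the files shown here. For orientation only, the following is a minimal sketch of what keyword-free, model-driven routing can look like with LangGraph; the state fields, node names, and the stubbed classifier are illustrative assumptions, not the committed implementation.

# Hypothetical sketch of LLM-based routing with LangGraph -- not the committed service code.
from typing import TypedDict
from langgraph.graph import StateGraph, END

class ChatState(TypedDict):
    question: str
    route: str
    answer: str

def classify(state: ChatState) -> ChatState:
    # In the real service an LLM call would pick the route here instead of
    # keyword matching; stubbed so the sketch stays self-contained.
    state["route"] = "knowledge"  # e.g. "knowledge" or "general"
    return state

def knowledge_agent(state: ChatState) -> ChatState:
    state["answer"] = f"(RAG answer to: {state['question']})"
    return state

def general_agent(state: ChatState) -> ChatState:
    state["answer"] = f"(direct answer to: {state['question']})"
    return state

graph = StateGraph(ChatState)
graph.add_node("router", classify)
graph.add_node("knowledge", knowledge_agent)
graph.add_node("general", general_agent)
graph.set_entry_point("router")
graph.add_conditional_edges("router", lambda s: s["route"],
                            {"knowledge": "knowledge", "general": "general"})
graph.add_edge("knowledge", END)
graph.add_edge("general", END)
app = graph.compile()

# Usage: app.invoke({"question": "What does the knowledge service do?", "route": "", "answer": ""})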

@@ -0,0 +1,144 @@
"""
Airflow DAG for scheduled Gitea repository ingestion.
Runs daily to fetch new/updated repos and ingest into ChromaDB.
"""
from datetime import datetime, timedelta
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.providers.http.operators.http import SimpleHttpOperator
import os
import sys
import json
# Add knowledge_service to path for imports
sys.path.insert(0, '/opt/airflow/dags/repo')
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'email_on_failure': False,
'email_on_retry': False,
'retries': 1,
'retry_delay': timedelta(minutes=5),
}
def fetch_gitea_repos(**context):
"""Task: Fetch all repositories from Gitea."""
from gitea_scraper import GiteaScraper
scraper = GiteaScraper(
base_url=os.getenv("GITEA_URL", "https://gitea.lab.audasmedia.com.au"),
token=os.getenv("GITEA_TOKEN", ""),
username=os.getenv("GITEA_USERNAME", "sam")
)
repos = scraper.get_user_repos()
# Push to XCom for downstream tasks
context['ti'].xcom_push(key='repo_count', value=len(repos))
context['ti'].xcom_push(key='repos', value=[
{
'name': r.name,
'description': r.description,
'url': r.url,
'updated_at': r.updated_at
}
for r in repos
])
return f"Fetched {len(repos)} repositories"
def fetch_readmes(**context):
"""Task: Fetch READMEs for all repositories."""
from gitea_scraper import GiteaScraper
ti = context['ti']
repos = ti.xcom_pull(task_ids='fetch_repos', key='repos')
scraper = GiteaScraper(
base_url=os.getenv("GITEA_URL", "https://gitea.lab.audasmedia.com.au"),
token=os.getenv("GITEA_TOKEN", ""),
username=os.getenv("GITEA_USERNAME", "sam")
)
readme_data = []
for repo in repos[:10]: # Limit to 10 repos per run for testing
readme = scraper.get_readme(repo['name'])
if readme:
readme_data.append({
'repo': repo['name'],
'content': readme[:5000], # First 5000 chars
'url': repo['url']
})
ti.xcom_push(key='readme_data', value=readme_data)
return f"Fetched {len(readme_data)} READMEs"
def ingest_to_chroma(**context):
"""Task: Ingest fetched data into ChromaDB via knowledge service."""
import httpx
ti = context['ti']
readme_data = ti.xcom_pull(task_ids='fetch_readmes', key='readme_data')
knowledge_service_url = os.getenv("KNOWLEDGE_SERVICE_URL", "http://knowledge-service:8080")
documents_ingested = 0
for item in readme_data:
try:
# Call knowledge service ingest endpoint
response = httpx.post(
f"{knowledge_service_url}/ingest",
json={
'source': f"gitea:{item['repo']}",
'content': item['content'],
'metadata': {
'repo': item['repo'],
'url': item['url'],
'type': 'readme'
}
},
timeout=30.0
)
if response.status_code == 200:
documents_ingested += 1
except Exception as e:
print(f"Error ingesting {item['repo']}: {e}")
return f"Ingested {documents_ingested} documents into ChromaDB"
# Define the DAG
with DAG(
'gitea_daily_ingestion',
default_args=default_args,
description='Daily ingestion of Gitea repositories into knowledge base',
schedule_interval=timedelta(days=1), # Run daily
start_date=datetime(2024, 1, 1),
catchup=False,
tags=['gitea', 'ingestion', 'knowledge'],
) as dag:
# Task 1: Fetch repository list
fetch_repos_task = PythonOperator(
task_id='fetch_repos',
python_callable=fetch_gitea_repos,
)
# Task 2: Fetch README content
fetch_readmes_task = PythonOperator(
task_id='fetch_readmes',
python_callable=fetch_readmes,
)
# Task 3: Ingest into ChromaDB
ingest_task = PythonOperator(
task_id='ingest_to_chroma',
python_callable=ingest_to_chroma,
)
# Define task dependencies
fetch_repos_task >> fetch_readmes_task >> ingest_task
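
The ingest_to_chroma task above posts to an /ingest endpoint on the knowledge service, which is defined elsewhere in the commit and not shown in this excerpt. For context, here is a minimal sketch of what such an endpoint could look like with FastAPI and the ChromaDB client; the storage path, collection name, and ID scheme are assumptions, and only the request fields (source, content, metadata) mirror what the DAG sends.

# Hypothetical sketch -- not the committed knowledge service code.
from fastapi import FastAPI
from pydantic import BaseModel
import chromadb

app = FastAPI()
client = chromadb.PersistentClient(path="/data/chroma_db")   # storage path is an assumption
collection = client.get_or_create_collection("gitea_docs")   # collection name is an assumption

class IngestRequest(BaseModel):
    source: str
    content: str
    metadata: dict

@app.post("/ingest")
def ingest(req: IngestRequest):
    # Upsert one document per source so re-runs update rather than duplicate.
    collection.upsert(
        ids=[req.source],
        documents=[req.content],
        metadatas=[{**req.metadata, "source": req.source}],
    )
    return {"status": "ok", "id": req.source}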


@@ -0,0 +1,121 @@
import os
import httpx
import logging
from typing import List, Dict, Optional
from dataclasses import dataclass

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


@dataclass
class RepoMetadata:
    name: str
    description: str
    url: str
    default_branch: str
    updated_at: str
    language: Optional[str]


class GiteaScraper:
    def __init__(self, base_url: str, token: str, username: str = "sam"):
        self.base_url = base_url.rstrip("/")
        self.token = token
        self.username = username
        self.headers = {"Authorization": f"token {token}"}

    def get_user_repos(self) -> List[RepoMetadata]:
        """Fetch all repositories for the user."""
        repos = []
        page = 1
        while True:
            url = f"{self.base_url}/api/v1/users/{self.username}/repos?page={page}&limit=50"
            try:
                response = httpx.get(url, headers=self.headers, timeout=30.0)
                response.raise_for_status()
                data = response.json()
                if not data:
                    break
                for repo in data:
                    repos.append(RepoMetadata(
                        name=repo["name"],
                        description=repo.get("description", ""),
                        url=repo["html_url"],
                        default_branch=repo["default_branch"],
                        updated_at=repo["updated_at"],
                        language=repo.get("language")
                    ))
                logger.info(f"Fetched page {page}, got {len(data)} repos")
                page += 1
            except Exception as e:
                logger.error(f"Error fetching repos: {e}")
                break
        return repos

    def get_readme(self, repo_name: str) -> str:
        """Fetch README content for a repository."""
        # Try common README filenames
        readme_names = ["README.md", "readme.md", "Readme.md", "README.rst"]
        for readme_name in readme_names:
            url = f"{self.base_url}/api/v1/repos/{self.username}/{repo_name}/raw/{readme_name}"
            try:
                response = httpx.get(url, headers=self.headers, timeout=10.0)
                if response.status_code == 200:
                    return response.text
            except Exception as e:
                logger.warning(f"Failed to fetch {readme_name}: {e}")
                continue
        return ""

    def get_repo_files(self, repo_name: str, path: str = "") -> List[Dict]:
        """List files in a repository directory."""
        url = f"{self.base_url}/api/v1/repos/{self.username}/{repo_name}/contents/{path}"
        try:
            response = httpx.get(url, headers=self.headers, timeout=10.0)
            response.raise_for_status()
            return response.json()
        except Exception as e:
            logger.error(f"Error listing files in {repo_name}/{path}: {e}")
            return []

    def get_file_content(self, repo_name: str, filepath: str) -> str:
        """Fetch content of a specific file."""
        url = f"{self.base_url}/api/v1/repos/{self.username}/{repo_name}/raw/{filepath}"
        try:
            response = httpx.get(url, headers=self.headers, timeout=10.0)
            if response.status_code == 200:
                return response.text
        except Exception as e:
            logger.error(f"Error fetching file {filepath}: {e}")
        return ""


# Test function
if __name__ == "__main__":
    scraper = GiteaScraper(
        base_url=os.getenv("GITEA_URL", "https://gitea.lab.audasmedia.com.au"),
        token=os.getenv("GITEA_TOKEN", ""),
        username=os.getenv("GITEA_USERNAME", "sam")
    )
    repos = scraper.get_user_repos()
    print(f"Found {len(repos)} repositories")
    for repo in repos[:3]:  # Test with first 3
        print(f"\nRepo: {repo.name}")
        readme = scraper.get_readme(repo.name)
        if readme:
            print(f"README preview: {readme[:200]}...")

airflow/docker-compose.yml

@@ -0,0 +1,181 @@
version: '3.8'

x-airflow-common:
  &airflow-common
  image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.8.1}
  environment:
    &airflow-common-env
    AIRFLOW__CORE__EXECUTOR: CeleryExecutor
    AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
    AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow
    AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0
    AIRFLOW__CORE__FERNET_KEY: ''
    AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true'
    AIRFLOW__CORE__LOAD_EXAMPLES: 'false'
    AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth,airflow.api.auth.backend.session'
    AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK: 'true'
    _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-}
  volumes:
    - ${AIRFLOW_PROJ_DIR:-.}/dags:/opt/airflow/dags
    - ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs
    - ${AIRFLOW_PROJ_DIR:-.}/config:/opt/airflow/config
    - ${AIRFLOW_PROJ_DIR:-.}/plugins:/opt/airflow/plugins
  user: "${AIRFLOW_UID:-50000}:0"
  depends_on:
    &airflow-common-depends-on
    redis:
      condition: service_healthy
    postgres:
      condition: service_healthy

services:
  postgres:
    image: postgres:13
    environment:
      POSTGRES_USER: airflow
      POSTGRES_PASSWORD: airflow
      POSTGRES_DB: airflow
    volumes:
      - postgres-db-volume:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD", "pg_isready", "-U", "airflow"]
      interval: 10s
      retries: 5
      start_period: 5s
    restart: always
    networks:
      - ai-mesh

  redis:
    image: redis:latest
    expose:
      - 6379
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 10s
      timeout: 30s
      retries: 50
      start_period: 30s
    restart: always
    networks:
      - ai-mesh

  airflow-webserver:
    <<: *airflow-common
    command: webserver
    ports:
      - "8081:8080"
    healthcheck:
      test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 30s
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully
    networks:
      - ai-mesh

  airflow-scheduler:
    <<: *airflow-common
    command: scheduler
    healthcheck:
      test: ["CMD", "curl", "--fail", "http://localhost:8974/health"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 30s
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully
    networks:
      - ai-mesh

  airflow-worker:
    <<: *airflow-common
    command: celery worker
    healthcheck:
      test:
        - "CMD-SHELL"
        - 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}" || celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"'
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 30s
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully
    networks:
      - ai-mesh

  airflow-triggerer:
    <<: *airflow-common
    command: triggerer
    healthcheck:
      test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"']
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 30s
    restart: always
    depends_on:
      <<: *airflow-common-depends-on
      airflow-init:
        condition: service_completed_successfully
    networks:
      - ai-mesh

  airflow-init:
    <<: *airflow-common
    entrypoint: /bin/bash
    command:
      - -c
      - |
        if [[ -z "${AIRFLOW_UID}" ]]; then
          echo "WARNING!!!: AIRFLOW_UID not set!"
          echo "Using default UID: 50000"
          export AIRFLOW_UID=50000
        fi
        mkdir -p /sources/logs /sources/dags /sources/plugins
        chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins}
        exec /entrypoint airflow version
    environment:
      <<: *airflow-common-env
      _AIRFLOW_DB_MIGRATE: 'true'
      _AIRFLOW_WWW_USER_CREATE: 'true'
      _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow}
      _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow}
    user: "0:0"
    volumes:
      - ${AIRFLOW_PROJ_DIR:-.}:/sources
    networks:
      - ai-mesh

  airflow-cli:
    <<: *airflow-common
    profiles:
      - debug
    environment:
      <<: *airflow-common-env
      CONNECTION_CHECK_MAX_COUNT: "0"
    command:
      - bash
      - -c
      - airflow
    networks:
      - ai-mesh

volumes:
  postgres-db-volume:

networks:
  ai-mesh:
    external: true
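
Note: the ai-mesh network is declared external, so Compose will not create it. It must already exist before this stack (or any of the other service stacks) is started, for example via a one-off "docker network create ai-mesh" on the host.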