Initial commit: Multi-service AI agent system

- Frontend: Vite + React + TypeScript chat interface
- Backend: FastAPI gateway with LangGraph routing
- Knowledge Service: ChromaDB RAG with Gitea scraper
- LangGraph Service: Multi-agent orchestration
- Airflow: Scheduled Gitea ingestion DAG
- Documentation: Complete plan and implementation guides

Architecture:
- Modular Docker Compose per service
- External ai-mesh network for communication
- Fast rebuilds with /app/packages pattern
- Intelligent agent routing (no hardcoded keywords)

Services:
- Frontend (5173): React chat UI
- Chat Gateway (8000): FastAPI entry point
- LangGraph (8090): Agent orchestration
- Knowledge (8080): ChromaDB RAG
- Airflow (8081): Scheduled ingestion
- PostgreSQL (5432): Chat history

Excludes: node_modules, .venv, chroma_db, logs, .env files
Includes: All source code, configs, docs, docker files
2026-02-27 19:51:06 +11:00
commit 628ba96998
44 changed files with 7177 additions and 0 deletions
--- a/airflow/dags/gitea_ingestion_dag.py
+++ b/airflow/dags/gitea_ingestion_dag.py
@@ -0,0 +1,144 @@
+"""
+Airflow DAG for scheduled Gitea repository ingestion.
+Runs daily: lists the user's Gitea repositories, fetches their READMEs, and ingests them into ChromaDB via the knowledge service.
+"""
+from datetime import datetime, timedelta
+from airflow import DAG
+from airflow.operators.python import PythonOperator
+from airflow.providers.http.operators.http import SimpleHttpOperator
+import os
+import sys
+import json
+
+# Put the repo checkout on sys.path for the imports done inside the task
+# callables (gitea_scraper is presumably found here - confirm against deployment).
+sys.path.insert(0, '/opt/airflow/dags/repo')
+
+# Task defaults handed to the DAG: no e-mail alerts, no dependence on the
+# previous run, and a single retry five minutes after a failure.
+default_args = dict(
+    owner='airflow',
+    depends_on_past=False,
+    email_on_failure=False,
+    email_on_retry=False,
+    retries=1,
+    retry_delay=timedelta(minutes=5),
+)
+
+def fetch_gitea_repos(**context):
+    """Task: list the user's Gitea repositories and publish them via XCom.
+
+    Connection settings come from the GITEA_URL, GITEA_TOKEN and
+    GITEA_USERNAME environment variables, with the fallbacks shown below.
+
+    XCom keys pushed:
+        repo_count: number of repositories returned by the scraper.
+        repos: list of dicts with 'name', 'description', 'url', 'updated_at'.
+
+    Returns:
+        A short human-readable summary string.
+    """
+    from gitea_scraper import GiteaScraper
+
+    gitea_url = os.getenv("GITEA_URL", "https://gitea.lab.audasmedia.com.au")
+    gitea_token = os.getenv("GITEA_TOKEN", "")
+    gitea_user = os.getenv("GITEA_USERNAME", "sam")
+    client = GiteaScraper(base_url=gitea_url, token=gitea_token, username=gitea_user)
+
+    found = client.get_user_repos()
+
+    task_instance = context['ti']
+    task_instance.xcom_push(key='repo_count', value=len(found))
+
+    # Reduce each repo object to plain fields before handing it to XCom.
+    summaries = []
+    for entry in found:
+        summaries.append({
+            'name': entry.name,
+            'description': entry.description,
+            'url': entry.url,
+            'updated_at': entry.updated_at,
+        })
+    task_instance.xcom_push(key='repos', value=summaries)
+
+    return f"Fetched {len(found)} repositories"
+
+def fetch_readmes(**context):
+    """Task: fetch README content for the repositories listed by 'fetch_repos'.
+
+    Pulls the 'repos' XCom pushed by the 'fetch_repos' task, asks the scraper
+    for the README of at most the first ``max_repos`` entries, truncates each
+    README to ``max_chars`` characters and pushes the result to XCom under
+    'readme_data'. Repositories for which the scraper returns nothing falsy
+    (no README) are skipped.
+
+    A missing upstream XCom is treated as an empty repository list instead of
+    failing with a TypeError when slicing None.
+
+    Returns:
+        A short human-readable summary string.
+    """
+    from gitea_scraper import GiteaScraper
+
+    max_repos = 10    # per-run cap, kept low for testing
+    max_chars = 5000  # only the leading part of each README is kept
+
+    ti = context['ti']
+    # xcom_pull yields None when no matching value exists (e.g. the task is
+    # run on its own), so fall back to an empty list.
+    repos = ti.xcom_pull(task_ids='fetch_repos', key='repos') or []
+
+    scraper = GiteaScraper(
+        base_url=os.getenv("GITEA_URL", "https://gitea.lab.audasmedia.com.au"),
+        token=os.getenv("GITEA_TOKEN", ""),
+        username=os.getenv("GITEA_USERNAME", "sam")
+    )
+
+    readme_data = []
+    for repo in repos[:max_repos]:
+        readme = scraper.get_readme(repo['name'])
+        if readme:
+            readme_data.append({
+                'repo': repo['name'],
+                'content': readme[:max_chars],
+                'url': repo['url']
+            })
+
+    ti.xcom_push(key='readme_data', value=readme_data)
+
+    return f"Fetched {len(readme_data)} READMEs"
+
+def ingest_to_chroma(**context):
+    """Task: send the fetched READMEs to the knowledge service for ingestion.
+
+    Pulls 'readme_data' from the 'fetch_readmes' task and POSTs each entry to
+    ``{KNOWLEDGE_SERVICE_URL}/ingest`` (default http://knowledge-service:8080).
+    Ingestion is best effort per document: a failed request or a non-2xx
+    response is logged and the loop moves on to the next document.
+
+    A missing upstream XCom is treated as an empty list.
+
+    Returns:
+        A summary string with the number of documents the service accepted.
+    """
+    import logging
+
+    import httpx
+
+    log = logging.getLogger(__name__)
+
+    ti = context['ti']
+    # xcom_pull yields None when no matching value exists; iterate nothing then.
+    readme_data = ti.xcom_pull(task_ids='fetch_readmes', key='readme_data') or []
+
+    knowledge_service_url = os.getenv("KNOWLEDGE_SERVICE_URL", "http://knowledge-service:8080")
+    ingest_url = f"{knowledge_service_url}/ingest"
+
+    documents_ingested = 0
+    # One client for the whole batch so connections are reused between posts.
+    with httpx.Client(timeout=30.0) as client:
+        for item in readme_data:
+            try:
+                response = client.post(
+                    ingest_url,
+                    json={
+                        'source': f"gitea:{item['repo']}",
+                        'content': item['content'],
+                        'metadata': {
+                            'repo': item['repo'],
+                            'url': item['url'],
+                            'type': 'readme'
+                        }
+                    },
+                )
+            except Exception:
+                # Deliberately broad: one bad document must not abort the batch.
+                log.exception("Error ingesting %s", item['repo'])
+                continue
+
+            # Any 2xx counts as accepted; everything else is reported rather
+            # than silently dropped.
+            if 200 <= response.status_code < 300:
+                documents_ingested += 1
+            else:
+                log.warning(
+                    "Knowledge service rejected %s: HTTP %s",
+                    item['repo'],
+                    response.status_code,
+                )
+
+    return f"Ingested {documents_ingested} documents into ChromaDB"
+
+# DAG definition: three PythonOperator tasks run in sequence, once per day
+with DAG(
+    dag_id='gitea_daily_ingestion',
+    description='Daily ingestion of Gitea repositories into knowledge base',
+    default_args=default_args,
+    start_date=datetime(2024, 1, 1),
+    schedule_interval=timedelta(days=1),  # one run per day
+    catchup=False,  # do not backfill the intervals since start_date
+    tags=['gitea', 'ingestion', 'knowledge'],
+) as dag:
+
+    # Step 1: list the repositories
+    fetch_repos_task = PythonOperator(
+        python_callable=fetch_gitea_repos,
+        task_id='fetch_repos',
+    )
+
+    # Step 2: download README content for the listed repositories
+    fetch_readmes_task = PythonOperator(
+        python_callable=fetch_readmes,
+        task_id='fetch_readmes',
+    )
+
+    # Step 3: hand the READMEs to the knowledge service (ChromaDB)
+    ingest_task = PythonOperator(
+        python_callable=ingest_to_chroma,
+        task_id='ingest_to_chroma',
+    )
+
+    # Strictly sequential: repos -> READMEs -> ingest
+    fetch_repos_task.set_downstream(fetch_readmes_task)
+    fetch_readmes_task.set_downstream(ingest_task)
+