Initial commit: Multi-service AI agent system
- Frontend: Vite + React + TypeScript chat interface
- Backend: FastAPI gateway with LangGraph routing
- Knowledge Service: ChromaDB RAG with Gitea scraper
- LangGraph Service: Multi-agent orchestration
- Airflow: Scheduled Gitea ingestion DAG
- Documentation: Complete plan and implementation guides

Architecture:
- Modular Docker Compose per service
- External ai-mesh network for communication
- Fast rebuilds with /app/packages pattern
- Intelligent agent routing (no hardcoded keywords)

Services:
- Frontend (5173): React chat UI
- Chat Gateway (8000): FastAPI entry point
- LangGraph (8090): Agent orchestration
- Knowledge (8080): ChromaDB RAG
- Airflow (8081): Scheduled ingestion
- PostgreSQL (5432): Chat history

Excludes: node_modules, .venv, chroma_db, logs, .env files
Includes: All source code, configs, docs, docker files
This commit is contained in:
144
airflow/dags/gitea_ingestion_dag.py
Normal file
144
airflow/dags/gitea_ingestion_dag.py
Normal file
@@ -0,0 +1,144 @@
|
||||
"""
|
||||
Airflow DAG for scheduled Gitea repository ingestion.
|
||||
Runs daily to fetch new/updated repos and ingest into ChromaDB.
|
||||
"""
|
||||
from datetime import datetime, timedelta
|
||||
from airflow import DAG
|
||||
from airflow.operators.python import PythonOperator
|
||||
from airflow.providers.http.operators.http import SimpleHttpOperator
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
|
||||
# Put the DAG repo checkout first on sys.path so the task callables below can
# import local modules at run time (they import `gitea_scraper`; presumably it
# lives in this directory -- confirm against the deployment layout).
sys.path.insert(0, '/opt/airflow/dags/repo')

# Arguments applied to every task of the DAG: runs are independent of the
# previous run, no e-mail notifications are sent, and a failed task is retried
# once after a five minute pause.
default_args = dict(
    owner='airflow',
    depends_on_past=False,
    email_on_failure=False,
    email_on_retry=False,
    retries=1,
    retry_delay=timedelta(minutes=5),
)
|
||||
|
||||
def fetch_gitea_repos(**context):
    """Task: Fetch all repositories from Gitea.

    Builds a ``GiteaScraper`` from the ``GITEA_URL``, ``GITEA_TOKEN`` and
    ``GITEA_USERNAME`` environment variables, lists the user's repositories
    and publishes two XComs on the running task instance (``context['ti']``):

    * ``repo_count`` -- number of repositories returned by the scraper.
    * ``repos`` -- one dict per repository with the keys ``name``,
      ``description``, ``url`` and ``updated_at``.

    Returns:
        A short summary string, e.g. ``"Fetched 12 repositories"``.
    """
    # Imported inside the task so the module is only needed when the task runs.
    from gitea_scraper import GiteaScraper

    gitea = GiteaScraper(
        base_url=os.getenv("GITEA_URL", "https://gitea.lab.audasmedia.com.au"),
        token=os.getenv("GITEA_TOKEN", ""),
        username=os.getenv("GITEA_USERNAME", "sam"),
    )
    found = gitea.get_user_repos()
    total = len(found)

    # Push to XCom for downstream tasks
    context['ti'].xcom_push(key='repo_count', value=total)

    # Only plain, serialisable fields are forwarded, not the scraper objects.
    summaries = []
    for entry in found:
        summaries.append({
            'name': entry.name,
            'description': entry.description,
            'url': entry.url,
            'updated_at': entry.updated_at,
        })
    context['ti'].xcom_push(key='repos', value=summaries)

    return f"Fetched {total} repositories"
|
||||
|
||||
def fetch_readmes(**context):
    """Task: Fetch READMEs for all repositories.

    Pulls the ``repos`` XCom published by the ``fetch_repos`` task, downloads
    the README of at most the first 10 repositories and publishes the result
    as the ``readme_data`` XCom: a list of dicts with the keys ``repo``,
    ``content`` (truncated to 5000 characters) and ``url``. Repositories
    without a README are skipped.

    A missing or empty ``repos`` XCom is treated as "nothing to do": an empty
    ``readme_data`` list is still pushed so the downstream task always finds
    the key.

    Returns:
        A short summary string, e.g. ``"Fetched 7 READMEs"``.
    """
    ti = context['ti']
    # xcom_pull may hand back None when no value is available; slicing None
    # would raise TypeError, so normalise to an empty list.
    repos = ti.xcom_pull(task_ids='fetch_repos', key='repos') or []

    readme_data = []
    if repos:
        # Only import and configure the scraper when there is work to do.
        from gitea_scraper import GiteaScraper

        scraper = GiteaScraper(
            base_url=os.getenv("GITEA_URL", "https://gitea.lab.audasmedia.com.au"),
            token=os.getenv("GITEA_TOKEN", ""),
            username=os.getenv("GITEA_USERNAME", "sam"),
        )

        for repo in repos[:10]:  # Limit to 10 repos per run for testing
            readme = scraper.get_readme(repo['name'])
            if not readme:
                continue
            readme_data.append({
                'repo': repo['name'],
                'content': readme[:5000],  # First 5000 chars
                'url': repo['url'],
            })

    ti.xcom_push(key='readme_data', value=readme_data)

    return f"Fetched {len(readme_data)} READMEs"
|
||||
|
||||
def ingest_to_chroma(**context):
    """Task: Ingest fetched data into ChromaDB via knowledge service.

    Pulls the ``readme_data`` XCom published by the ``fetch_readmes`` task
    and POSTs each entry to ``{KNOWLEDGE_SERVICE_URL}/ingest`` (default
    ``http://knowledge-service:8080``). Ingestion is best-effort: a failing
    document is reported on stdout and the loop moves on to the next one.

    A missing or empty ``readme_data`` XCom results in no HTTP calls.

    Returns:
        A short summary string with the number of documents the service
        accepted (any 2xx response).
    """
    ti = context['ti']
    # xcom_pull may hand back None when no value is available; iterating None
    # would raise TypeError, so normalise to an empty list.
    readme_data = ti.xcom_pull(task_ids='fetch_readmes', key='readme_data') or []
    if not readme_data:
        return "Ingested 0 documents into ChromaDB"

    import httpx

    knowledge_service_url = os.getenv("KNOWLEDGE_SERVICE_URL", "http://knowledge-service:8080")
    ingest_url = f"{knowledge_service_url}/ingest"

    documents_ingested = 0
    # One client for the whole batch so connections are reused between posts.
    with httpx.Client(timeout=30.0) as client:
        for item in readme_data:
            try:
                # Call knowledge service ingest endpoint
                response = client.post(
                    ingest_url,
                    json={
                        'source': f"gitea:{item['repo']}",
                        'content': item['content'],
                        'metadata': {
                            'repo': item['repo'],
                            'url': item['url'],
                            'type': 'readme',
                        },
                    },
                )
            except Exception as e:
                # Deliberately broad: one bad document must not abort the batch.
                print(f"Error ingesting {item['repo']}: {e}")
                continue

            if 200 <= response.status_code < 300:
                documents_ingested += 1
            else:
                # Surface rejections instead of dropping them silently.
                print(f"Error ingesting {item['repo']}: HTTP {response.status_code}")

    return f"Ingested {documents_ingested} documents into ChromaDB"
|
||||
|
||||
# Define the DAG
|
||||
# Daily pipeline: list repositories -> download READMEs -> ingest into the
# knowledge base. `catchup=False` prevents back-filling runs between
# `start_date` and now.
# NOTE(review): newer Airflow releases prefer `schedule=` over
# `schedule_interval=` -- confirm against the deployed Airflow version.
with DAG(
    dag_id="gitea_daily_ingestion",
    default_args=default_args,
    description="Daily ingestion of Gitea repositories into knowledge base",
    schedule_interval=timedelta(days=1),  # one run every 24 hours
    start_date=datetime(2024, 1, 1),
    catchup=False,
    tags=["gitea", "ingestion", "knowledge"],
) as dag:

    # Step 1: list the user's repositories and publish them via XCom.
    fetch_repos_task = PythonOperator(
        task_id="fetch_repos",
        python_callable=fetch_gitea_repos,
    )

    # Step 2: download README content for the listed repositories.
    fetch_readmes_task = PythonOperator(
        task_id="fetch_readmes",
        python_callable=fetch_readmes,
    )

    # Step 3: send the README content to the knowledge service.
    ingest_task = PythonOperator(
        task_id="ingest_to_chroma",
        python_callable=ingest_to_chroma,
    )

    # Strictly linear ordering of the three steps.
    fetch_repos_task >> fetch_readmes_task
    fetch_readmes_task >> ingest_task
|
||||
|
||||
121
airflow/dags/gitea_scraper.py
Normal file
121
airflow/dags/gitea_scraper.py
Normal file
@@ -0,0 +1,121 @@
|
||||
import os
|
||||
import httpx
|
||||
import logging
|
||||
from typing import List, Dict, Optional
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@dataclass()
class RepoMetadata:
    """Selected fields of one repository entry from the Gitea API.

    Instances are built by ``GiteaScraper.get_user_repos`` from the JSON
    objects of the repository listing.
    """

    # Repository name (JSON key "name").
    name: str
    # Repository description (JSON key "description").
    description: str
    # Web URL of the repository (JSON key "html_url").
    url: str
    # Name of the default branch (JSON key "default_branch").
    default_branch: str
    # Last-update timestamp, kept as the string delivered by the API.
    updated_at: str
    # Language reported by the API (JSON key "language"); None when absent.
    language: Optional[str]
|
||||
|
||||
class GiteaScraper:
    """Small read-only client for the Gitea REST API (``/api/v1``).

    All fetch methods are best-effort: network and HTTP errors are logged and
    turned into an empty (or partial) result instead of being raised.
    """

    def __init__(self, base_url: str, token: str, username: str = "sam"):
        """Store connection settings and prepare the request headers.

        Args:
            base_url: Root URL of the Gitea instance; a trailing ``/`` is
                removed.
            token: API token. When empty or blank, requests are sent without
                an ``Authorization`` header.
            username: Owner whose repositories are queried.
        """
        self.base_url = base_url.rstrip("/")
        self.token = token
        self.username = username
        # A blank token would produce the header value "token " (trailing
        # space), which the HTTP client rejects as an illegal header value,
        # so every request would fail. Omit the header entirely instead.
        clean_token = (token or "").strip()
        self.headers = {"Authorization": f"token {clean_token}"} if clean_token else {}

    def get_user_repos(self) -> List["RepoMetadata"]:
        """Fetch all repositories for the user.

        Walks the paginated listing (50 entries per page) until an empty page
        is returned. If a request fails, the error is logged and the
        repositories collected so far are returned, so the result may be
        partial or empty.
        """
        repos = []
        page = 1

        while True:
            url = f"{self.base_url}/api/v1/users/{self.username}/repos?page={page}&limit=50"

            try:
                response = httpx.get(url, headers=self.headers, timeout=30.0)
                response.raise_for_status()

                data = response.json()
                if not data:
                    # An empty page marks the end of the listing.
                    break

                repos.extend(
                    RepoMetadata(
                        name=repo["name"],
                        # The key can be present with a null value; keep the
                        # field a string as declared on RepoMetadata.
                        description=repo.get("description") or "",
                        url=repo["html_url"],
                        default_branch=repo["default_branch"],
                        updated_at=repo["updated_at"],
                        language=repo.get("language"),
                    )
                    for repo in data
                )

                logger.info(f"Fetched page {page}, got {len(data)} repos")
                page += 1

            except Exception as e:
                # Best-effort: stop paginating and return what we have.
                logger.error(f"Error fetching repos: {e}")
                break

        return repos

    def get_readme(self, repo_name: str) -> str:
        """Fetch README content for a repository.

        Tries a fixed list of common README filenames in order and returns the
        text of the first one answered with HTTP 200, or ``""`` when none is
        found.
        """
        # Try common README filenames
        readme_names = ["README.md", "readme.md", "Readme.md", "README.rst"]

        for readme_name in readme_names:
            url = f"{self.base_url}/api/v1/repos/{self.username}/{repo_name}/raw/{readme_name}"

            try:
                response = httpx.get(url, headers=self.headers, timeout=10.0)
                if response.status_code == 200:
                    return response.text
            except Exception as e:
                # A transport error on one candidate should not prevent
                # trying the remaining filenames.
                logger.warning(f"Failed to fetch {readme_name}: {e}")
                continue

        return ""

    def get_repo_files(self, repo_name: str, path: str = "") -> List[Dict]:
        """List files in a repository directory.

        Returns the decoded JSON body of the ``contents`` endpoint for
        ``path``, or ``[]`` when the request fails.
        """
        url = f"{self.base_url}/api/v1/repos/{self.username}/{repo_name}/contents/{path}"

        try:
            response = httpx.get(url, headers=self.headers, timeout=10.0)
            response.raise_for_status()
            return response.json()
        except Exception as e:
            logger.error(f"Error listing files in {repo_name}/{path}: {e}")
            return []

    def get_file_content(self, repo_name: str, filepath: str) -> str:
        """Fetch content of a specific file.

        Returns the response text on HTTP 200; any other status or a request
        error yields ``""``.
        """
        url = f"{self.base_url}/api/v1/repos/{self.username}/{repo_name}/raw/{filepath}"

        try:
            response = httpx.get(url, headers=self.headers, timeout=10.0)
            if response.status_code == 200:
                return response.text
        except Exception as e:
            logger.error(f"Error fetching file {filepath}: {e}")

        return ""
|
||||
|
||||
# Test function
|
||||
if __name__ == "__main__":
    # Manual smoke test: list the user's repositories and print a short
    # README preview for the first three of them.
    client = GiteaScraper(
        base_url=os.getenv("GITEA_URL", "https://gitea.lab.audasmedia.com.au"),
        token=os.getenv("GITEA_TOKEN", ""),
        username=os.getenv("GITEA_USERNAME", "sam"),
    )

    all_repos = client.get_user_repos()
    print(f"Found {len(all_repos)} repositories")

    sample = all_repos[:3]  # Test with first 3
    for item in sample:
        print(f"\nRepo: {item.name}")
        text = client.get_readme(item.name)
        if not text:
            # Nothing to preview for repositories without a README.
            continue
        print(f"README preview: {text[:200]}...")
|
||||
|
||||
Reference in New Issue
Block a user