Files
aboutme_chat/airflow/dags/gitea_scraper.py
Sam Rolfe 628ba96998 Initial commit: Multi-service AI agent system
- Frontend: Vite + React + TypeScript chat interface
- Backend: FastAPI gateway with LangGraph routing
- Knowledge Service: ChromaDB RAG with Gitea scraper
- LangGraph Service: Multi-agent orchestration
- Airflow: Scheduled Gitea ingestion DAG
- Documentation: Complete plan and implementation guides

Architecture:
- Modular Docker Compose per service
- External ai-mesh network for communication
- Fast rebuilds with /app/packages pattern
- Intelligent agent routing (no hardcoded keywords)

Services:
- Frontend (5173): React chat UI
- Chat Gateway (8000): FastAPI entry point
- LangGraph (8090): Agent orchestration
- Knowledge (8080): ChromaDB RAG
- Airflow (8081): Scheduled ingestion
- PostgreSQL (5432): Chat history

Excludes: node_modules, .venv, chroma_db, logs, .env files
Includes: All source code, configs, docs, docker files
2026-02-27 19:51:06 +11:00

122 lines
4.1 KiB
Python

import logging
import os
from dataclasses import dataclass
from datetime import datetime
from typing import List, Dict, Optional
from urllib.parse import quote

import httpx
# Configure the root logger at import time so INFO-level messages are shown.
# NOTE(review): this runs whenever the module is imported, not only when it is
# executed as a script -- confirm that is intended for the importing process.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
@dataclass
class RepoMetadata:
    """Summary of a single repository as collected by ``GiteaScraper``.

    Attributes:
        name: Repository name.
        description: Repository description text.
        url: Browser URL of the repository (the API's ``html_url`` value).
        default_branch: Name of the repository's default branch.
        updated_at: Last-update timestamp, kept as the string the API returned.
        language: Language reported for the repository, or ``None`` when the
            API response carries none.
    """

    name: str
    description: str
    url: str
    default_branch: str
    updated_at: str
    # Optional in the type, so it is optional in the constructor as well.
    language: Optional[str] = None
class GiteaScraper:
    """Read-only helper for pulling one user's repositories from a Gitea API.

    All fetch methods are best-effort: a failed request is logged and an empty
    value (or, for the paginated listing, whatever was collected so far) is
    returned instead of raising.

    Args:
        base_url: Root URL of the Gitea instance; trailing slashes are removed.
        token: API token sent as ``Authorization: token <token>``. When the
            token is empty no Authorization header is sent.
        username: Owner whose repositories are queried.
    """

    def __init__(self, base_url: str, token: str, username: str = "sam"):
        self.base_url = base_url.rstrip("/")
        self.token = token
        self.username = username
        # An empty token would otherwise yield a credential-less "token "
        # header; send no Authorization header at all in that case.
        self.headers = {"Authorization": f"token {token}"} if token else {}

    def _repo_api_url(self, repo_name: str, endpoint: str, path: str = "") -> str:
        """Return ``<base>/api/v1/repos/<owner>/<repo>/<endpoint>/<path>``.

        Owner and repository name are escaped as single path segments; ``path``
        keeps its ``/`` separators but has every other reserved character
        (space, ``#``, ``?`` ...) percent-encoded so it cannot be read as a
        query string or fragment.
        """
        owner = quote(self.username, safe="")
        repo = quote(repo_name, safe="")
        return f"{self.base_url}/api/v1/repos/{owner}/{repo}/{endpoint}/{quote(path)}"

    def get_user_repos(self) -> "List[RepoMetadata]":
        """Fetch all repositories for the user.

        Pages through the listing 50 entries at a time until an empty page is
        returned.

        Returns:
            One ``RepoMetadata`` per repository. If a request or a page's
            parsing fails, the error is logged and the repositories gathered
            up to that point are returned.
        """
        repos = []
        owner = quote(self.username, safe="")
        page = 1
        while True:
            url = f"{self.base_url}/api/v1/users/{owner}/repos?page={page}&limit=50"
            # Deliberately broad: any failure (transport, HTTP status, JSON,
            # missing key) ends pagination but keeps the partial result.
            try:
                response = httpx.get(url, headers=self.headers, timeout=30.0)
                response.raise_for_status()
                data = response.json()
                if not data:
                    break
                for repo in data:
                    repos.append(RepoMetadata(
                        name=repo["name"],
                        # The key may be present with a null value; keep the
                        # field a string either way.
                        description=repo.get("description") or "",
                        url=repo["html_url"],
                        default_branch=repo["default_branch"],
                        updated_at=repo["updated_at"],
                        language=repo.get("language"),
                    ))
                logger.info(f"Fetched page {page}, got {len(data)} repos")
                page += 1
            except Exception as e:
                logger.error(f"Error fetching repos: {e}")
                break
        return repos

    def get_readme(self, repo_name: str) -> str:
        """Fetch README content for a repository.

        Tries a fixed list of common README filenames in order and returns the
        body of the first one answered with HTTP 200.

        Returns:
            The README text, or ``""`` when none of the candidates is found.
        """
        # Try common README filenames
        readme_names = ["README.md", "readme.md", "Readme.md", "README.rst"]
        for readme_name in readme_names:
            url = self._repo_api_url(repo_name, "raw", readme_name)
            try:
                response = httpx.get(url, headers=self.headers, timeout=10.0)
                if response.status_code == 200:
                    return response.text
            except Exception as e:
                # A failure on one candidate must not stop the others.
                logger.warning(f"Failed to fetch {readme_name}: {e}")
                continue
        return ""

    def get_repo_files(self, repo_name: str, path: str = "") -> List[Dict]:
        """List files in a repository directory.

        Args:
            repo_name: Repository to inspect.
            path: Directory path inside the repository; ``""`` is the root.

        Returns:
            The decoded JSON body of the contents endpoint, or ``[]`` when the
            request fails or returns an error status.
        """
        url = self._repo_api_url(repo_name, "contents", path)
        try:
            response = httpx.get(url, headers=self.headers, timeout=10.0)
            response.raise_for_status()
            return response.json()
        except Exception as e:
            logger.error(f"Error listing files in {repo_name}/{path}: {e}")
            return []

    def get_file_content(self, repo_name: str, filepath: str) -> str:
        """Fetch content of a specific file.

        Returns:
            The file body as text when the server answers HTTP 200; ``""`` for
            any other status or when the request itself fails.
        """
        url = self._repo_api_url(repo_name, "raw", filepath)
        try:
            response = httpx.get(url, headers=self.headers, timeout=10.0)
            if response.status_code == 200:
                return response.text
        except Exception as e:
            logger.error(f"Error fetching file {filepath}: {e}")
        return ""
# Manual smoke test: list the user's repositories and preview a few READMEs.
if __name__ == "__main__":
    # Connection settings come from the environment, with fallbacks.
    gitea_url = os.getenv("GITEA_URL", "https://gitea.lab.audasmedia.com.au")
    gitea_token = os.getenv("GITEA_TOKEN", "")
    gitea_user = os.getenv("GITEA_USERNAME", "sam")
    scraper = GiteaScraper(base_url=gitea_url, token=gitea_token, username=gitea_user)

    repos = scraper.get_user_repos()
    print(f"Found {len(repos)} repositories")

    # Only the first three repositories are sampled.
    for repo in repos[:3]:
        print(f"\nRepo: {repo.name}")
        readme = scraper.get_readme(repo.name)
        if not readme:
            continue
        print(f"README preview: {readme[:200]}...")