Moved updated services from /home/sam/development/ root into aboutme_chat_demo/: - knowledge_service/ (with ChromaDB, gitea_scraper, FastAPI) - langgraph_service/ (with LangGraph agent orchestration) - airflow/ (with DAGs for scheduled ingestion) All services now in single repo location. Modular docker-compose files per service maintained. Removed duplicate nested directories. Updated files reflect latest working versions.
121 lines
4.1 KiB
Python
121 lines
4.1 KiB
Python
import os
|
|
import httpx
|
|
import logging
|
|
from typing import List, Dict, Optional
|
|
from dataclasses import dataclass
|
|
from datetime import datetime
|
|
|
|
# Configure root logging once at import time; use a module-scoped logger so
# records are namespaced under this module's dotted path.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
|
|
|
|
@dataclass
class RepoMetadata:
    """Subset of Gitea repository fields retained for ingestion.

    Values are copied verbatim from the Gitea ``/users/{user}/repos``
    API response by ``GiteaScraper.get_user_repos``.
    """

    name: str             # repository name (unique per owner)
    description: str      # "" when the repo has no description
    url: str              # browser-facing URL (API "html_url" field)
    default_branch: str   # e.g. "main" or "master"
    updated_at: str       # timestamp string as returned by the API (not parsed)
    language: Optional[str]  # primary language, or None when Gitea reports none
|
|
|
|
class GiteaScraper:
    """Thin best-effort client for the Gitea v1 REST API.

    Every method swallows network/parse errors, logs them, and returns an
    empty result (``[]`` or ``""``) instead of raising, so callers can treat
    scraping as best-effort.
    """

    def __init__(self, base_url: str, token: str, username: str = "sam"):
        """
        Args:
            base_url: Gitea instance root, e.g. ``https://gitea.example.com``.
                A trailing slash is stripped so URL joins stay clean.
            token: Gitea API token, sent as ``Authorization: token <token>``.
            username: Owner whose repositories are scraped.
        """
        self.base_url = base_url.rstrip("/")
        self.token = token
        self.username = username
        self.headers = {"Authorization": f"token {token}"}

    def get_user_repos(self) -> List[RepoMetadata]:
        """Fetch all repositories for the user.

        Pages through the API 50 repos at a time until an empty page is
        returned. On any request/parse error the loop stops and whatever
        was collected so far is returned (partial results, never a raise).
        """
        repos: List[RepoMetadata] = []
        page = 1

        while True:
            url = f"{self.base_url}/api/v1/users/{self.username}/repos?page={page}&limit=50"

            try:
                response = httpx.get(url, headers=self.headers, timeout=30.0)
                response.raise_for_status()

                data = response.json()
                if not data:
                    break  # empty page == no more results

                for repo in data:
                    repos.append(RepoMetadata(
                        name=repo["name"],
                        # Gitea may send an explicit JSON null for these
                        # fields; dict.get's default only covers a *missing*
                        # key, so coerce None -> "" to honor the str type.
                        description=repo.get("description") or "",
                        url=repo["html_url"],
                        default_branch=repo["default_branch"],
                        updated_at=repo["updated_at"],
                        language=repo.get("language")
                    ))

                # Lazy %-args: formatting only happens if the record is emitted.
                logger.info("Fetched page %s, got %s repos", page, len(data))
                page += 1

            except Exception as e:
                # Broad on purpose: any failure ends pagination with partial results.
                logger.error("Error fetching repos: %s", e)
                break

        return repos

    def get_readme(self, repo_name: str) -> str:
        """Fetch README content for a repository.

        Tries common README filename casings in order and returns the first
        one that exists; returns "" when none is found or all attempts fail.
        """
        readme_names = ["README.md", "readme.md", "Readme.md", "README.rst"]

        for readme_name in readme_names:
            url = f"{self.base_url}/api/v1/repos/{self.username}/{repo_name}/raw/{readme_name}"

            try:
                response = httpx.get(url, headers=self.headers, timeout=10.0)
                if response.status_code == 200:
                    return response.text
                # Non-200 (e.g. 404): silently try the next candidate name.
            except Exception as e:
                logger.warning("Failed to fetch %s: %s", readme_name, e)
                continue

        return ""

    def get_repo_files(self, repo_name: str, path: str = "") -> List[Dict]:
        """List files in a repository directory.

        Args:
            repo_name: Repository (owned by ``self.username``) to inspect.
            path: Directory path within the repo; "" means the repo root.

        Returns:
            The API's JSON list of content entries, or [] on any error.
        """
        url = f"{self.base_url}/api/v1/repos/{self.username}/{repo_name}/contents/{path}"

        try:
            response = httpx.get(url, headers=self.headers, timeout=10.0)
            response.raise_for_status()
            return response.json()
        except Exception as e:
            logger.error("Error listing files in %s/%s: %s", repo_name, path, e)
            return []

    def get_file_content(self, repo_name: str, filepath: str) -> str:
        """Fetch raw content of a specific file.

        Returns "" when the file is missing (non-200) or the request fails.
        """
        url = f"{self.base_url}/api/v1/repos/{self.username}/{repo_name}/raw/{filepath}"

        try:
            response = httpx.get(url, headers=self.headers, timeout=10.0)
            if response.status_code == 200:
                return response.text
        except Exception as e:
            logger.error("Error fetching file %s: %s", filepath, e)

        return ""
|
|
|
|
def _demo() -> None:
    """Smoke-test the scraper against a live Gitea instance (network I/O).

    Configuration comes from GITEA_URL / GITEA_TOKEN / GITEA_USERNAME
    environment variables; prints a short summary of the first few repos.
    """
    token = os.getenv("GITEA_TOKEN", "")
    if not token:
        # An empty token means every request goes out unauthenticated;
        # surface that early instead of failing with opaque 401s/404s.
        logger.warning("GITEA_TOKEN is not set; requests will be unauthenticated")

    scraper = GiteaScraper(
        base_url=os.getenv("GITEA_URL", "https://gitea.lab.audasmedia.com.au"),
        token=token,
        username=os.getenv("GITEA_USERNAME", "sam")
    )

    repos = scraper.get_user_repos()
    print(f"Found {len(repos)} repositories")

    # Preview only the first three repos to keep the smoke test fast.
    for repo in repos[:3]:
        print(f"\nRepo: {repo.name}")
        readme = scraper.get_readme(repo.name)
        if readme:
            print(f"README preview: {readme[:200]}...")


if __name__ == "__main__":
    _demo()