"""Read-only scraper for repositories hosted on a Gitea instance.

Fetches repository metadata, README files, directory listings and raw
file contents for a single user through the Gitea REST API (v1).
"""

import os
import httpx
import logging
from typing import List, Dict, Optional
from dataclasses import dataclass
from datetime import datetime

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


@dataclass
class RepoMetadata:
    """Subset of the Gitea repository payload that callers care about."""

    name: str
    description: str
    url: str
    default_branch: str
    # ISO-8601 timestamp string, exactly as returned by the API.
    updated_at: str
    # None when Gitea has not detected a primary language.
    language: Optional[str]


class GiteaScraper:
    """Thin client for browsing one user's repositories on a Gitea server.

    All methods are best-effort: request failures are logged and an empty
    result (empty list / empty string) is returned rather than raised.
    """

    def __init__(self, base_url: str, token: str, username: str = "sam") -> None:
        """Configure the scraper.

        Args:
            base_url: Root URL of the Gitea instance (trailing slash stripped).
            token: Personal access token, sent as ``Authorization: token <t>``.
            username: Account whose repositories are scraped.
        """
        self.base_url = base_url.rstrip("/")
        self.token = token
        self.username = username
        self.headers = {"Authorization": f"token {token}"}

    def get_user_repos(self) -> List[RepoMetadata]:
        """Fetch all repositories for the user, following pagination.

        Returns:
            All repositories collected so far; on a request failure the
            partial list gathered up to that point is returned (the error
            is logged).
        """
        repos: List[RepoMetadata] = []
        page = 1
        while True:
            url = f"{self.base_url}/api/v1/users/{self.username}/repos"
            try:
                response = httpx.get(
                    url,
                    headers=self.headers,
                    params={"page": page, "limit": 50},
                    timeout=30.0,
                )
                response.raise_for_status()
                data = response.json()
                if not data:
                    # An empty page means we are past the last page.
                    break
                for repo in data:
                    repos.append(RepoMetadata(
                        name=repo["name"],
                        # The API may return an explicit null description;
                        # .get(..., "") would pass None through, so coerce.
                        description=repo.get("description") or "",
                        url=repo["html_url"],
                        default_branch=repo["default_branch"],
                        updated_at=repo["updated_at"],
                        language=repo.get("language"),
                    ))
                logger.info("Fetched page %d, got %d repos", page, len(data))
                page += 1
            except Exception as e:
                logger.error("Error fetching repos: %s", e)
                break
        return repos

    def get_readme(self, repo_name: str) -> str:
        """Fetch README content for a repository.

        Tries common README filename variants in order and returns the
        first one that exists; returns "" if none is found.
        """
        readme_names = ["README.md", "readme.md", "Readme.md", "README.rst"]
        for readme_name in readme_names:
            url = (
                f"{self.base_url}/api/v1/repos/{self.username}/"
                f"{repo_name}/raw/{readme_name}"
            )
            try:
                response = httpx.get(url, headers=self.headers, timeout=10.0)
                if response.status_code == 200:
                    return response.text
            except Exception as e:
                logger.warning("Failed to fetch %s: %s", readme_name, e)
                continue
        return ""

    def get_repo_files(self, repo_name: str, path: str = "") -> List[Dict]:
        """List files in a repository directory.

        Args:
            repo_name: Repository to inspect.
            path: Directory path within the repo; "" for the root.

        Returns:
            The API's JSON listing, or [] on any error (logged).
        """
        url = (
            f"{self.base_url}/api/v1/repos/{self.username}/"
            f"{repo_name}/contents/{path}"
        )
        try:
            response = httpx.get(url, headers=self.headers, timeout=10.0)
            response.raise_for_status()
            return response.json()
        except Exception as e:
            logger.error("Error listing files in %s/%s: %s", repo_name, path, e)
            return []

    def get_file_content(self, repo_name: str, filepath: str) -> str:
        """Fetch raw content of a specific file.

        Returns "" when the file is missing (non-200) or the request fails.
        """
        url = (
            f"{self.base_url}/api/v1/repos/{self.username}/"
            f"{repo_name}/raw/{filepath}"
        )
        try:
            response = httpx.get(url, headers=self.headers, timeout=10.0)
            if response.status_code == 200:
                return response.text
        except Exception as e:
            logger.error("Error fetching file %s: %s", filepath, e)
        return ""


# Manual smoke test: requires GITEA_URL / GITEA_TOKEN in the environment.
if __name__ == "__main__":
    scraper = GiteaScraper(
        base_url=os.getenv("GITEA_URL", "https://gitea.lab.audasmedia.com.au"),
        token=os.getenv("GITEA_TOKEN", ""),
        username=os.getenv("GITEA_USERNAME", "sam"),
    )
    repos = scraper.get_user_repos()
    print(f"Found {len(repos)} repositories")
    for repo in repos[:3]:
        print(f"\nRepo: {repo.name}")
        readme = scraper.get_readme(repo.name)
        if readme:
            print(f"README preview: {readme[:200]}...")