Complete guide to GitHub Models integration via Azure AI Inference
Aurelis exclusively uses GitHub Models through Azure AI Inference, providing enterprise-grade AI capabilities with a single authentication token and unified API access. GitHub Models exposes cutting-edge models from multiple providers through one OpenAI-compatible API, so a single GitHub token covers every model listed in this guide, with no per-provider keys or billing accounts to manage.
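Under the hood, every call goes to the GitHub Models inference endpoint. For context, a minimal direct call with the `azure-ai-inference` SDK looks roughly like this (Aurelis wraps this behind its orchestrator, so you normally never write it yourself; the endpoint URL and SDK usage are standard GitHub Models conventions, not Aurelis APIs):

```python
import os

from azure.ai.inference import ChatCompletionsClient
from azure.ai.inference.models import SystemMessage, UserMessage
from azure.core.credentials import AzureKeyCredential

# GitHub Models inference endpoint; authentication is just the GitHub token.
client = ChatCompletionsClient(
    endpoint="https://models.inference.ai.azure.com",
    credential=AzureKeyCredential(os.environ["GITHUB_TOKEN"]),
)

response = client.complete(
    model="gpt-4o-mini",
    messages=[
        SystemMessage(content="You are a helpful coding assistant."),
        UserMessage(content="Write a one-line docstring for a bubble sort."),
    ],
)
print(response.choices[0].message.content)
```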
Your GitHub token needs these scopes:

- `read:user` - Basic user information
- `read:org` - Organization membership (if applicable)

Create the token under GitHub Profile → Settings → Developer settings → Personal access tokens, and enable:

- ✅ `read:user`
- ✅ `read:org` (if using in organization context)
Visit the GitHub Models Marketplace to verify that your account has model access.
# Linux/macOS
export GITHUB_TOKEN="ghp_your_token_here"
# Windows PowerShell
$env:GITHUB_TOKEN="ghp_your_token_here"
# Windows Command Prompt
set GITHUB_TOKEN=ghp_your_token_here
# .aurelis.yaml
github_token: "${GITHUB_TOKEN}" # Use environment variable
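A sketch of the `${VAR}` substitution assumed above (Aurelis resolves these placeholders when loading the config; this shows only the general mechanism):

```python
import os
import re

def expand_env(value: str) -> str:
    """Replace ${VAR} placeholders with environment variable values."""
    return re.sub(r"\$\{(\w+)\}", lambda m: os.environ.get(m.group(1), ""), value)

print(expand_env("${GITHUB_TOKEN}")[:4])  # "ghp_" if the token is set
```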
# Test your token
aurelis models
# Should show:
# ✅ GitHub Token: ghp_1234...5678
# ✅ Status: Ready for GitHub models
| Model | Provider | Specialty | Context Window | Best For |
|---|---|---|---|---|
| Codestral-2501 | Mistral | Code generation & optimization | 4K tokens | Production code, refactoring |
| GPT-4o | OpenAI | Complex reasoning & multimodal | 4K tokens | Architecture, complex analysis |
| GPT-4o-mini | OpenAI | Fast responses & documentation | 4K tokens | Quick tasks, documentation |
| Cohere Command-R | Cohere | Documentation & explanations | 4K tokens | Technical writing, explanations |
| Cohere Command-R+ | Cohere | Advanced reasoning | 4K tokens | Complex problem solving |
| Meta Llama 3.1 70B | Meta | Balanced performance | 4K tokens | General development tasks |
| Meta Llama 3.1 405B | Meta | Maximum capability | 4K tokens | Enterprise applications |
| Mistral Large | Mistral | Enterprise applications | 4K tokens | Production systems |
| Mistral Nemo | Mistral | Fast inference | 4K tokens | Real-time applications |
- Code Generation: Codestral-2501 → GPT-4o → Llama 70B
- Documentation: Command-R → GPT-4o-mini → Command-R+
- Complex Reasoning: GPT-4o → Llama 405B → Mistral Large
- Performance Optimization: Codestral-2501 → Llama 70B → GPT-4o
- Quick Tasks: GPT-4o-mini → Mistral Nemo → Command-R
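These preference chains amount to an ordered task-to-model mapping. A sketch of that shape, using only the `ModelType` and `TaskType` members that appear elsewhere in this guide (the orchestrator's actual internal table may differ):

```python
from aurelis.models import ModelType, TaskType

# Illustrative fallback chains mirroring the list above,
# not the orchestrator's literal internals.
TASK_PREFERENCES = {
    TaskType.CODE_GENERATION: [ModelType.CODESTRAL_2501, ModelType.GPT_4O],
    TaskType.CODE_OPTIMIZATION: [ModelType.CODESTRAL_2501, ModelType.GPT_4O],
    TaskType.DOCUMENTATION: [ModelType.COHERE_COMMAND_R, ModelType.GPT_4O_MINI],
}
```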
Aurelis automatically selects the optimal model based on task type:
from aurelis.models import ModelRequest, TaskType
# Code generation automatically uses Codestral-2501
request = ModelRequest(
prompt="Create a REST API for user management",
task_type=TaskType.CODE_GENERATION
)
# Documentation automatically uses Cohere Command-R
request = ModelRequest(
prompt="Document this API endpoint",
task_type=TaskType.DOCUMENTATION
)
from aurelis.models import ModelRequest, ModelType
# Force specific model
request = ModelRequest(
prompt="Optimize this algorithm for performance",
model_type=ModelType.CODESTRAL_2501,
task_type=TaskType.CODE_OPTIMIZATION
)
| Primary Model | Automatic Fallback | Retry Logic |
|---|---|---|
| Codestral-2501 | GPT-4o | 3 attempts |
| GPT-4o | Llama 405B | 3 attempts |
| Command-R | GPT-4o-mini | 3 attempts |
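A minimal sketch of this fallback-plus-retry pattern (illustrative only; the orchestrator implements this internally, and the model identifiers and `send` callable below are placeholders):

```python
import asyncio

# Fallback chains from the table above (illustrative identifiers).
FALLBACKS = {
    "codestral-2501": "gpt-4o",
    "gpt-4o": "llama-3.1-405b",
    "command-r": "gpt-4o-mini",
}

async def send_with_fallback(send, model: str, prompt: str, attempts: int = 3):
    """Try the primary model, then its fallback, retrying each up to `attempts` times."""
    for candidate in filter(None, (model, FALLBACKS.get(model))):
        for attempt in range(attempts):
            try:
                return await send(candidate, prompt)
            except Exception:
                await asyncio.sleep(2 ** attempt)  # simple exponential backoff
    raise RuntimeError(f"{model} and its fallback both failed")
```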
from aurelis.models import get_model_orchestrator, ModelRequest, TaskType
# Initialize orchestrator
orchestrator = get_model_orchestrator()
# Create request
request = ModelRequest(
prompt="Generate a Python function to validate email addresses",
task_type=TaskType.CODE_GENERATION,
temperature=0.1, # Deterministic for code
max_tokens=1000
)
# Send request
response = await orchestrator.send_request(request)
print(response.content)
request = ModelRequest(
prompt="Optimize this database query",
task_type=TaskType.CODE_OPTIMIZATION,
system_prompt="""You are a database performance expert.
Focus on query efficiency and index usage.
Provide detailed explanations for optimizations.""",
temperature=0.1,
max_tokens=2000,
metadata={
"project": "aurelis",
"database": "postgresql",
"user_id": "developer_123"
}
)
requests = [
ModelRequest(prompt="Generate user model", task_type=TaskType.CODE_GENERATION),
ModelRequest(prompt="Generate user controller", task_type=TaskType.CODE_GENERATION),
ModelRequest(prompt="Generate user tests", task_type=TaskType.TESTING)
]
responses = await orchestrator.batch_request(requests)
for response in responses:
print(f"Model used: {response.model_used}")
print(f"Content: {response.content[:100]}...")
# First request hits the model
response1 = await orchestrator.send_request(request)
print(f"Cached: {response1.cached}") # False
# Identical request hits cache
response2 = await orchestrator.send_request(request)
print(f"Cached: {response2.cached}") # True
# .aurelis.yaml
cache:
enabled: true
ttl: 3600 # 1 hour
max_size: 1000
strategy: "lru" # Least recently used
# Monitor token usage
response = await orchestrator.send_request(request)
print(f"Tokens used: {response.token_usage}")
print(f"Processing time: {response.processing_time:.2f}s")
# Optimize for cost
request = ModelRequest(
prompt="Brief explanation of Python decorators",
max_tokens=200, # Limit response length
temperature=0.1 # Deterministic responses
)
# Get model statistics
stats = orchestrator.get_model_stats()
print(f"Available models: {stats['available_models']}")
print(f"Task mappings: {stats['task_mappings']}")
# Health check
health = orchestrator.health_check()
print(f"Overall status: {health['overall_status']}")
for model, status in health['models'].items():
print(f"{model}: {status['status']}")
# Environment variable (recommended)
github_token: "${GITHUB_TOKEN}"
# System keyring (enterprise)
security:
token_storage: "keyring"
keyring_service: "aurelis"
keyring_username: "github_models"
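With the keyring backend, the token lives in the operating system's credential store. A minimal sketch using the `keyring` package with the service and username values configured above:

```python
import keyring

# One-time setup: store the token in the OS credential store.
keyring.set_password("aurelis", "github_models", "ghp_your_token_here")

# Later reads never touch environment variables or config files.
token = keyring.get_password("aurelis", "github_models")
```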
Rotate tokens regularly (every 90 days):
1. Generate new GitHub token
2. Update environment variable
3. Restart Aurelis services
4. Revoke old token
# .aurelis.yaml
security:
audit_logging: true
log_level: "INFO"
log_requests: true
log_responses: false # Never log response content
# Enterprise configuration
security:
compliance_mode: true
data_retention_days: 90
encryption_at_rest: true
secure_headers: true
# Secure request headers
request = ModelRequest(
prompt="Generate secure authentication code",
metadata={
"security_level": "high",
"compliance": "sox_hip",
"data_classification": "confidential"
}
)
Error: GitHub token not found
# Verify environment variable
echo $GITHUB_TOKEN # Linux/macOS
echo $env:GITHUB_TOKEN # Windows PowerShell
# Set if missing
export GITHUB_TOKEN="your_token_here"
Error: Invalid token format
# GitHub tokens start with 'ghp_'
✅ Good: ghp_1234567890abcdef...
❌ Bad: gho_1234567890abcdef...
Error: Model not available
# Check model status
aurelis models
# Expected output:
# ✅ Codestral-2501: Available
# ✅ GPT-4o: Available
Error: Rate limit exceeded
# Check rate limits in response metadata
if 'rate_limit_remaining' in response.metadata:
print(f"Remaining: {response.metadata['rate_limit_remaining']}")
print(f"Reset time: {response.metadata['rate_limit_reset']}")
Issue: Slow response times
# Optimize configuration
processing:
timeout: 30 # Reduce timeout
concurrent_requests: 3 # Reduce concurrency
cache:
enabled: true # Enable caching
ttl: 1800 # 30 minutes
Issue: High token usage
# Optimize requests
request = ModelRequest(
prompt="Brief: " + your_prompt, # Add "Brief:" prefix
max_tokens=500, # Limit response
temperature=0.1 # Deterministic
)
# Full health check
aurelis health
# Model availability
aurelis models --verbose
# Token validation
aurelis auth status
# Configuration check
aurelis config validate
# .aurelis.yaml
logging:
level: "DEBUG"
handlers:
- "console"
- "file"
file_path: "aurelis_debug.log"
# Use task-specific models
CODE_TASKS = [TaskType.CODE_GENERATION, TaskType.CODE_OPTIMIZATION]
DOCS_TASKS = [TaskType.DOCUMENTATION, TaskType.EXPLANATIONS]
COMPLEX_TASKS = [TaskType.ARCHITECTURAL_DECISIONS, TaskType.COMPLEX_REASONING]
# Let Aurelis choose automatically
request = ModelRequest(
prompt=your_prompt,
task_type=detected_task_type # Aurelis selects optimal model
)
# Use appropriate models for task complexity
simple_request = ModelRequest(
prompt="Fix this typo",
model_type=ModelType.GPT_4O_MINI # Cheaper for simple tasks
)
complex_request = ModelRequest(
prompt="Design microservices architecture",
model_type=ModelType.GPT_4O # Worth the cost for complex tasks
)
# Cache documentation requests
docs_request = ModelRequest(
prompt="Document this function",
task_type=TaskType.DOCUMENTATION
# Caching enabled by default
)
# Disable cache for unique generation
unique_request = ModelRequest(
prompt=f"Generate unique UUID implementation {timestamp}",
metadata={"cache_disabled": True}
)
import logging

# ModelRequest is shown earlier; the ModelError import path is assumed here.
from aurelis.models import ModelError, ModelRequest

logger = logging.getLogger(__name__)

async def robust_request(prompt: str) -> str:
try:
request = ModelRequest(prompt=prompt)
response = await orchestrator.send_request(request)
return response.content
except ModelError as e:
logger.error(f"Model request failed: {e}")
# Fallback logic
return await fallback_handler(prompt)
except Exception as e:
logger.error(f"Unexpected error: {e}")
return "Error: Unable to process request"
# Track usage patterns
response = await orchestrator.send_request(request)
# Log metrics
metrics = {
"model_used": response.model_used,
"tokens_used": response.token_usage.get("total_tokens", 0),
"processing_time": response.processing_time,
"cached": response.cached,
"confidence": response.confidence
}
analytics_logger.info(json.dumps(metrics))
# Production security configuration
security:
token_storage: "keyring"
audit_logging: true
request_timeout: 30
max_retries: 3
# Never log sensitive data
logging:
exclude_patterns:
- "github_token"
- "api_key"
- "password"
request = ModelRequest(
prompt="""
Create a Python class for user authentication with:
- Email/password login
- Token-based sessions
- Rate limiting
- Audit logging
""",
task_type=TaskType.CODE_GENERATION,
system_prompt="Generate production-ready code with proper error handling",
temperature=0.1
)
response = await orchestrator.send_request(request)
request = ModelRequest(
prompt=f"""
Document this function:
{function_code}
Include: purpose, parameters, return value, examples, and edge cases.
""",
task_type=TaskType.DOCUMENTATION,
model_type=ModelType.COHERE_COMMAND_R
)
request = ModelRequest(
prompt=f"""
Review and optimize this code for:
- Performance improvements
- Security vulnerabilities
- Code style and best practices
{code_to_review}
""",
task_type=TaskType.CODE_OPTIMIZATION,
system_prompt="You are a senior software engineer conducting a code review"
)
# Override default routing
custom_orchestrator = GitHubModelOrchestrator()
custom_orchestrator.task_model_mapping[TaskType.CODE_GENERATION] = [
ModelType.GPT_4O, # Use GPT-4o first
ModelType.CODESTRAL_2501 # Fallback to Codestral
]
# Stream responses for long generations
async for chunk in orchestrator.stream_request(request):
print(chunk.content, end="", flush=True)
# Efficient batch processing
requests = [
ModelRequest(prompt=p, task_type=TaskType.CODE_GENERATION)
for p in prompts
]
# Process in parallel with rate limiting
responses = await orchestrator.batch_request(
requests,
max_concurrent=5,
rate_limit=10 # requests per second
)
import asyncio
from typing import Optional

import typer
from rich.console import Console

from aurelis.models import ModelRequest, ModelType, TaskType, get_model_orchestrator

app = typer.Typer()
console = Console()

@app.command()
def generate(
prompt: str = typer.Argument(..., help="Generation prompt"),
model: Optional[str] = typer.Option(None, help="Specific model to use")
):
"""Generate code using GitHub models."""
orchestrator = get_model_orchestrator()
request = ModelRequest(
prompt=prompt,
model_type=ModelType(model) if model else None,
task_type=TaskType.CODE_GENERATION
)
response = asyncio.run(orchestrator.send_request(request))
console.print(response.content)
class AurelisShell:
def __init__(self):
self.orchestrator = get_model_orchestrator()
async def process_input(self, user_input: str) -> str:
task_type = self._detect_task_type(user_input)
request = ModelRequest(
prompt=user_input,
task_type=task_type,
temperature=0.1
)
response = await self.orchestrator.send_request(request)
return response.content
from typing import Optional

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

# The ModelError import path is assumed; the rest are shown earlier in this guide.
from aurelis.models import ModelError, ModelRequest, TaskType, get_model_orchestrator

class GenerationRequest(BaseModel):
    """Request body for /api/generate (field names assumed for this example)."""
    prompt: str
    task_type: TaskType
    temperature: Optional[float] = None

app = FastAPI()
orchestrator = get_model_orchestrator()
@app.post("/api/generate")
async def generate_code(request: GenerationRequest):
try:
model_request = ModelRequest(
prompt=request.prompt,
task_type=request.task_type,
temperature=request.temperature or 0.1
)
response = await orchestrator.send_request(model_request)
return {
"content": response.content,
"model_used": response.model_used,
"tokens_used": response.token_usage,
"processing_time": response.processing_time
}
except ModelError as e:
raise HTTPException(status_code=500, detail=str(e))
Last Updated: December 2024
Version: 2.0.0
Author: Gamecooler19 (Lead Developer at Kanopus)
Aurelis - Where AI meets enterprise code development