556 lines
19 KiB
Python
556 lines
19 KiB
Python
"""
FastAPI system endpoints for monitoring, health checks and administration.
TIER 3 - system functions for operational management.
"""
|
||
|
||
import asyncio
|
||
import platform
|
||
import psutil
|
||
import time
|
||
from datetime import datetime, timedelta
|
||
from typing import Dict, List, Optional, Any
|
||
from uuid import UUID
|
||
|
||
from fastapi import APIRouter, HTTPException, Request, Depends, Query
|
||
from fastapi.responses import JSONResponse
|
||
from sqlalchemy import select, text
|
||
|
||
from app.core.config import get_settings
|
||
from app.core.database import db_manager, get_cache_manager
|
||
from app.core.logging import get_logger
|
||
from app.core.crypto import get_ed25519_manager
|
||
from app.core.models.content_models import StoredContent as Content
|
||
from app.core.models.user import User
|
||
from app.api.fastapi_middleware import require_auth, require_admin
|
||
|
||
# Router that groups every system endpoint under the /api/system prefix
router = APIRouter(
    prefix="/api/system",
    tags=["system"],
)
logger = get_logger(__name__)
settings = get_settings()

# Module-level monitoring state: import timestamp (used for uptime
# reporting) and the request / error counters
_start_time = time.time()
_request_counter = 0
_error_counter = 0
|
||
|
||
|
||
@router.get("/health")
async def health_check():
    """
    Basic service health check.

    Probes the database, the cache and the Ed25519 signing manager and
    reports the outcome of each probe. Declares no auth dependency, so it
    can be polled by load balancers.

    Returns:
        JSONResponse with HTTP 200 when every probe is healthy, otherwise
        HTTP 503 (status "degraded", or "unhealthy" if the check itself
        failed).
    """
    try:
        # Database probe: run a trivial query through a session
        try:
            async with db_manager.get_session() as session:
                await session.execute(text("SELECT 1"))
        except Exception as exc:
            db_status = f"unhealthy: {str(exc)[:100]}"
        else:
            db_status = "healthy"

        # Cache probe: write a short-lived key
        try:
            cache = await get_cache_manager()
            await cache.set("health_check", "ok", ttl=10)
        except Exception as exc:
            cache_status = f"unhealthy: {str(exc)[:100]}"
        else:
            cache_status = "healthy"

        # Crypto probe: sign a test payload and verify the signature
        try:
            signer = get_ed25519_manager()
            probe = {"test": "health_check"}
            probe_signature = signer.sign_message(probe)
            if signer.verify_signature(probe, probe_signature, signer.public_key_hex):
                crypto_status = "healthy"
            else:
                crypto_status = "unhealthy: signature verification failed"
        except Exception as exc:
            crypto_status = f"unhealthy: {str(exc)[:100]}"

        # A single unhealthy dependency degrades the overall status
        probe_results = (db_status, cache_status, crypto_status)
        if any("unhealthy" in result for result in probe_results):
            overall_status = "degraded"
        else:
            overall_status = "healthy"

        health_data = {
            "status": overall_status,
            "timestamp": datetime.utcnow().isoformat(),
            "services": {
                "database": db_status,
                "cache": cache_status,
                "cryptography": crypto_status,
            },
            "uptime_seconds": int(time.time() - _start_time),
        }

        # Map the overall status onto the HTTP status code
        if overall_status == "healthy":
            return JSONResponse(content=health_data, status_code=200)
        return JSONResponse(content=health_data, status_code=503)

    except Exception as exc:
        await logger.aerror("Health check failed", error=str(exc))
        failure_payload = {
            "status": "unhealthy",
            "error": "Health check system failure",
            "timestamp": datetime.utcnow().isoformat(),
        }
        return JSONResponse(content=failure_payload, status_code=503)
|
||
|
||
|
||
@router.get("/health/detailed")
async def detailed_health_check(
    request: Request,
    current_user: User = Depends(require_admin)
):
    """
    Detailed health report with host, database, cache and application metrics.

    Guarded by the ``require_admin`` dependency.

    Args:
        request: Incoming request (not read by the handler body).
        current_user: User resolved by ``require_admin``; its id is logged
            when the report cannot be built.

    Returns:
        dict with the keys ``status``, ``timestamp``, ``system``,
        ``database``, ``cache``, ``application`` and ``configuration``.

    Raises:
        HTTPException: 500 when anything outside the individually guarded
            database / cache sections fails.
    """
    try:
        # cpu_percent(interval=1) sleeps for the whole sampling window, so it
        # is run in the default executor instead of stalling the event loop.
        loop = asyncio.get_running_loop()
        cpu_percent = await loop.run_in_executor(None, psutil.cpu_percent, 1)

        # Take one snapshot per resource so the reported fields are
        # mutually consistent.
        memory = psutil.virtual_memory()
        disk = psutil.disk_usage('/')

        system_metrics = {
            "cpu_percent": cpu_percent,
            "memory": {
                "total": memory.total,
                "available": memory.available,
                "percent": memory.percent
            },
            "disk": {
                "total": disk.total,
                "used": disk.used,
                "free": disk.free,
                "percent": disk.percent
            },
            "load_average": psutil.getloadavg() if hasattr(psutil, 'getloadavg') else None
        }

        # Database metrics; a failure is reported under the "error" key
        # instead of failing the whole report.
        db_metrics = {}
        try:
            async with db_manager.get_session() as session:
                # Number of users
                user_count = await session.execute(text("SELECT COUNT(*) FROM users"))
                db_metrics["users_count"] = user_count.scalar()

                # Number of stored content items
                content_count = await session.execute(text("SELECT COUNT(*) FROM stored_content"))
                db_metrics["content_count"] = content_count.scalar()

                # Human-readable database size (PostgreSQL functions)
                db_size = await session.execute(text("""
                    SELECT pg_size_pretty(pg_database_size(current_database()))
                """))
                db_metrics["database_size"] = db_size.scalar()

        except Exception as e:
            db_metrics["error"] = str(e)

        # Cache metrics: currently only checks that the manager can be obtained
        cache_metrics = {}
        try:
            await get_cache_manager()
            # TODO: add Redis metrics here if available
            cache_metrics["status"] = "connected"
        except Exception as e:
            cache_metrics["error"] = str(e)

        # Application metrics
        app_metrics = {
            "uptime_seconds": int(time.time() - _start_time),
            "requests_total": _request_counter,
            "errors_total": _error_counter,
            # max(..., 1) avoids division by zero before the first request
            "error_rate": _error_counter / max(_request_counter, 1),
            "python_version": platform.python_version(),
            "platform": platform.platform()
        }

        # Configuration
        config_info = {
            "debug_mode": getattr(settings, 'DEBUG', False),
            "environment": getattr(settings, 'ENVIRONMENT', 'unknown'),
            "version": getattr(settings, 'VERSION', 'unknown'),
            "node_id": get_ed25519_manager().node_id[:8] + "..."  # Partial ID for security
        }

        # NOTE(review): "status" is always "healthy" here, even when
        # db_metrics / cache_metrics carry an "error" key - confirm intended.
        detailed_health = {
            "status": "healthy",
            "timestamp": datetime.utcnow().isoformat(),
            "system": system_metrics,
            "database": db_metrics,
            "cache": cache_metrics,
            "application": app_metrics,
            "configuration": config_info
        }

        return detailed_health

    except Exception as e:
        await logger.aerror(
            "Detailed health check failed",
            user_id=str(current_user.id),
            error=str(e)
        )
        raise HTTPException(status_code=500, detail="Failed to get detailed health status")
|
||
|
||
|
||
@router.get("/metrics")
async def prometheus_metrics():
    """
    Expose process and host metrics in the Prometheus text format.

    Returns:
        A plain-text response whose body is the metrics document.

    Raises:
        HTTPException: 500 when collecting the metrics fails.
    """
    # Local import keeps this block self-contained; the file's import block
    # only brings in JSONResponse.
    from fastapi.responses import PlainTextResponse

    try:
        # Host metrics. cpu_percent(interval=0.1) sleeps for the sampling
        # window, so it is run in the default executor to keep the event
        # loop free.
        loop = asyncio.get_running_loop()
        cpu_usage = await loop.run_in_executor(None, psutil.cpu_percent, 0.1)
        memory = psutil.virtual_memory()
        disk = psutil.disk_usage('/')

        # Application metrics
        uptime = int(time.time() - _start_time)

        # Prometheus exposition text
        metrics = f"""# HELP uploader_bot_uptime_seconds Total uptime in seconds
# TYPE uploader_bot_uptime_seconds counter
uploader_bot_uptime_seconds {uptime}

# HELP uploader_bot_requests_total Total number of HTTP requests
# TYPE uploader_bot_requests_total counter
uploader_bot_requests_total {_request_counter}

# HELP uploader_bot_errors_total Total number of errors
# TYPE uploader_bot_errors_total counter
uploader_bot_errors_total {_error_counter}

# HELP system_cpu_usage_percent CPU usage percentage
# TYPE system_cpu_usage_percent gauge
system_cpu_usage_percent {cpu_usage}

# HELP system_memory_usage_percent Memory usage percentage
# TYPE system_memory_usage_percent gauge
system_memory_usage_percent {memory.percent}

# HELP system_disk_usage_percent Disk usage percentage
# TYPE system_disk_usage_percent gauge
system_disk_usage_percent {disk.percent}

# HELP system_memory_total_bytes Total memory in bytes
# TYPE system_memory_total_bytes gauge
system_memory_total_bytes {memory.total}

# HELP system_memory_available_bytes Available memory in bytes
# TYPE system_memory_available_bytes gauge
system_memory_available_bytes {memory.available}
"""

        # JSONResponse would JSON-encode the string (surrounding quotes and
        # escaped newlines), which scrapers cannot parse; send the text as-is.
        return PlainTextResponse(
            content=metrics,
            media_type="text/plain"
        )

    except Exception as e:
        await logger.aerror(
            "Metrics collection failed",
            error=str(e)
        )
        raise HTTPException(status_code=500, detail="Failed to collect metrics")
|
||
|
||
|
||
@router.get("/info")
async def system_info():
    """
    Public description of this node.

    Returns:
        dict with the service name and version, the node id and public key
        taken from the Ed25519 manager, the advertised capabilities and
        supported formats, the maximum file size and a timestamp.

    Raises:
        HTTPException: 500 when the description cannot be assembled.
    """
    # Static parts of the payload
    capabilities = [
        "content_upload",
        "content_sync",
        "decentralized_filtering",
        "ed25519_signatures",
        "web2_client_api",
    ]
    supported_formats = [
        "image/*",
        "video/*",
        "audio/*",
        "text/*",
        "application/pdf",
    ]

    try:
        signer = get_ed25519_manager()

        return {
            "service": "uploader-bot",
            "version": getattr(settings, 'VERSION', 'unknown'),
            "api_version": "v1",
            "network": "MY Network v3.0",
            "node_id": signer.node_id,
            "public_key": signer.public_key_hex,
            "capabilities": capabilities,
            "supported_formats": supported_formats,
            # Falls back to 100 MiB when MAX_FILE_SIZE is not configured
            "max_file_size": getattr(settings, 'MAX_FILE_SIZE', 100 * 1024 * 1024),
            "timestamp": datetime.utcnow().isoformat(),
        }

    except Exception as exc:
        await logger.aerror("System info failed", error=str(exc))
        raise HTTPException(status_code=500, detail="Failed to get system information")
|
||
|
||
|
||
@router.get("/stats")
async def system_statistics(
    request: Request,
    current_user: User = Depends(require_auth),
    days: int = Query(7, ge=1, le=30, description="Number of days for statistics")
):
    """
    Content, user and process statistics for the last ``days`` days.

    Args:
        request: Incoming request (not read by the handler body).
        current_user: User resolved by ``require_auth``; its id is logged on
            failure.
        days: Size of the reporting window in days (1-30, default 7).

    Returns:
        dict with the sections ``content``, ``users`` and ``system``.

    Raises:
        HTTPException: 500 when the statistics cannot be generated.
    """
    try:
        # Start of the reporting window
        since_date = datetime.utcnow() - timedelta(days=days)

        async with db_manager.get_session() as session:
            # Aggregate figures over stored content
            content_result = await session.execute(
                text("""
                    SELECT
                        COUNT(*) as total_content,
                        SUM(CASE WHEN created_at >= :since_date THEN 1 ELSE 0 END) as new_content,
                        SUM(file_size) as total_size,
                        AVG(file_size) as avg_size
                    FROM stored_content
                """),
                {"since_date": since_date},
            )
            content_row = content_result.fetchone()

            # Aggregates are NULL on an empty table, hence the "or 0" fallbacks
            content_section = {
                "total_items": content_row.total_content or 0,
                "new_items": content_row.new_content or 0,
                "total_size_bytes": content_row.total_size or 0,
                "average_size_bytes": float(content_row.avg_size or 0),
            }

            # Aggregate figures over users
            user_result = await session.execute(
                text("""
                    SELECT
                        COUNT(*) as total_users,
                        SUM(CASE WHEN created_at >= :since_date THEN 1 ELSE 0 END) as new_users
                    FROM users
                """),
                {"since_date": since_date},
            )
            user_row = user_result.fetchone()

            users_section = {
                "total_users": user_row.total_users or 0,
                "new_users": user_row.new_users or 0,
            }

        # Process-level figures
        system_section = {
            "uptime_seconds": int(time.time() - _start_time),
            "requests_handled": _request_counter,
            "errors_occurred": _error_counter,
            "period_days": days,
            "generated_at": datetime.utcnow().isoformat(),
        }

        return {
            "content": content_section,
            "users": users_section,
            "system": system_section,
        }

    except Exception as exc:
        await logger.aerror(
            "Statistics generation failed",
            user_id=str(current_user.id),
            error=str(exc),
        )
        raise HTTPException(status_code=500, detail="Failed to generate statistics")
|
||
|
||
|
||
@router.post("/maintenance")
async def toggle_maintenance_mode(
    request: Request,
    enabled: bool = Query(description="Enable or disable maintenance mode"),
    current_user: User = Depends(require_admin)
):
    """
    Switch maintenance mode on or off.

    Guarded by the ``require_admin`` dependency. The flag is kept in the
    cache under the ``maintenance_mode`` key.

    Args:
        request: Incoming request (not read by the handler body).
        enabled: True to enable maintenance mode, False to disable it.
        current_user: Admin performing the change; its id is recorded and
            logged.

    Returns:
        dict with a ``message`` and, when enabling, the stored
        ``maintenance_info``.

    Raises:
        HTTPException: 500 when the cache operation or logging fails.
    """
    try:
        cache = await get_cache_manager()
        admin_id = str(current_user.id) if not enabled else None

        # Disabling: drop the flag and report
        if not enabled:
            await cache.delete("maintenance_mode")
            await logger.ainfo("Maintenance mode disabled", admin_id=admin_id)
            return {"message": "Maintenance mode disabled"}

        # Enabling: store a descriptor of who switched it on and when
        maintenance_info = {
            "enabled": True,
            "enabled_at": datetime.utcnow().isoformat(),
            "enabled_by": str(current_user.id),
            "message": "System is under maintenance. Please try again later.",
        }
        # The flag expires on its own after 24 hours
        await cache.set("maintenance_mode", maintenance_info, ttl=86400)

        await logger.awarning(
            "Maintenance mode enabled",
            admin_id=str(current_user.id),
        )

        return {
            "message": "Maintenance mode enabled",
            "maintenance_info": maintenance_info,
        }

    except Exception as exc:
        await logger.aerror(
            "Maintenance mode toggle failed",
            admin_id=str(current_user.id),
            error=str(exc),
        )
        raise HTTPException(status_code=500, detail="Failed to toggle maintenance mode")
|
||
|
||
|
||
@router.get("/logs")
async def get_system_logs(
    request: Request,
    current_user: User = Depends(require_admin),
    level: str = Query("INFO", description="Log level filter"),
    lines: int = Query(100, ge=1, le=1000, description="Number of lines to return"),
    component: Optional[str] = Query(None, description="Filter by component")
):
    """
    Return system log entries.

    Guarded by the ``require_admin`` dependency. This is a placeholder: it
    returns one synthetic entry and only echoes the requested filters
    without applying them.

    Args:
        request: Incoming request (not read by the handler body).
        current_user: Admin making the request; its id is put in the entry.
        level: Requested log level filter (echoed back).
        lines: Requested number of lines, 1-1000 (echoed back).
        component: Optional component filter (echoed back).

    Returns:
        dict with ``logs``, ``total_lines``, ``filters`` and ``generated_at``.

    Raises:
        HTTPException: 500 when the response cannot be built.
    """
    try:
        # A real implementation would read from the logger or the file
        # system here; for now a single demonstration entry is produced.
        demo_entry = {
            "timestamp": datetime.utcnow().isoformat(),
            "level": "INFO",
            "component": "system",
            "message": "System logs endpoint accessed",
            "user_id": str(current_user.id),
        }
        logs = [demo_entry]

        requested_filters = {
            "level": level,
            "lines": lines,
            "component": component,
        }

        return {
            "logs": logs,
            "total_lines": len(logs),
            "filters": requested_filters,
            "generated_at": datetime.utcnow().isoformat(),
        }

    except Exception as exc:
        await logger.aerror(
            "Log retrieval failed",
            admin_id=str(current_user.id),
            error=str(exc),
        )
        raise HTTPException(status_code=500, detail="Failed to retrieve logs")
|
||
|
||
|
||
# Request-counting hook (intended to be used by the main application)
async def increment_request_counter():
    """Add one to the module-level request counter."""
    global _request_counter
    _request_counter = _request_counter + 1
|
||
|
||
async def increment_error_counter():
    """Add one to the module-level error counter."""
    global _error_counter
    _error_counter = _error_counter + 1
|
||
|
||
|
||
# Readiness probe (Kubernetes)
@router.get("/ready")
async def readiness_check():
    """
    Report whether the service is ready to handle requests.

    Probes the database and the cache; intended for a Kubernetes readiness
    probe.

    Returns:
        JSONResponse with HTTP 200 and status "ready" when both probes
        pass, otherwise HTTP 503 and status "not_ready". The per-service
        results are listed under ``checks``.
    """
    try:
        # Results of the critical-service probes
        checks = []

        # Database probe
        try:
            async with db_manager.get_session() as session:
                await session.execute(text("SELECT 1"))
            checks.append({"service": "database", "status": "ready"})
        except Exception as e:
            checks.append({"service": "database", "status": "not_ready", "error": str(e)})

        # Cache probe
        try:
            cache_manager = await get_cache_manager()
            await cache_manager.set("readiness_check", "ok", ttl=5)
            checks.append({"service": "cache", "status": "ready"})
        except Exception as e:
            checks.append({"service": "cache", "status": "not_ready", "error": str(e)})

        # Ready only when every probe passed
        all_ready = all(check["status"] == "ready" for check in checks)

        return JSONResponse(
            content={
                "status": "ready" if all_ready else "not_ready",
                "checks": checks,
                "timestamp": datetime.utcnow().isoformat()
            },
            status_code=200 if all_ready else 503
        )

    except Exception as e:
        # Log the failure instead of discarding it, matching the other
        # handlers in this module.
        await logger.aerror(
            "Readiness check failed",
            error=str(e)
        )
        return JSONResponse(
            content={
                "status": "not_ready",
                "error": "Readiness check failed",
                "timestamp": datetime.utcnow().isoformat()
            },
            status_code=503
        )
|
||
|
||
|
||
# Liveness probe for Kubernetes
@router.get("/live")
async def liveness_check():
    """
    Report that the application process is alive.

    Performs no dependency checks; intended for a Kubernetes liveness probe.

    Returns:
        dict with ``status``, ``timestamp`` and ``uptime_seconds``.
    """
    reported_at = datetime.utcnow().isoformat()
    uptime = int(time.time() - _start_time)
    return {
        "status": "alive",
        "timestamp": reported_at,
        "uptime_seconds": uptime,
    }