# uploader-bot/app/core/stats/metrics_collector.py
from __future__ import annotations
import asyncio
import logging
import os
import time
from typing import Optional, Tuple
from app.core.models.stats.metrics_models import SystemMetrics, AppMetrics
logger = logging.getLogger(__name__)
def _try_import_psutil():
    """Return the psutil module when importable, otherwise None.

    A missing psutil is not fatal: system metrics simply degrade to None
    values, so the failure is logged as a warning rather than raised.
    """
    try:
        import psutil  # type: ignore
    except Exception as e:
        logger.warning("psutil not available, system metrics will be limited: %s", e)
        return None
    return psutil
class MetricsCollector:
"""
Сборщик внутренних метрик:
- System: CPU, RAM, Disk, IO, Network
- App: conversions, requests, errors, slow ops, latency
Хранит только последнюю сессию счетчиков (агрегация истории выполняется в StatsAggregator).
"""
def __init__(self) -> None:
self._psutil = _try_import_psutil()
# App counters
self._total_conversions = 0
self._total_requests = 0
self._total_errors = 0
self._slow_ops_count = 0
# Latency rolling values (экспоненциальная сглаженная средняя для p95/p99 — упрощённо)
self._avg_response_ms: Optional[float] = None
self._p95_response_ms: Optional[float] = None
self._p99_response_ms: Optional[float] = None
# Previous snapshots for rate calculations
self._last_disk_io: Optional[Tuple[int, int, float]] = None # (read_bytes, write_bytes, ts)
self._last_net_io: Optional[Tuple[int, int, float]] = None # (bytes_sent, bytes_recv, ts)
# Uptime
try:
self._start_ts = int(os.getenv("NODE_START_TS", str(int(time.time()))))
except Exception:
self._start_ts = int(time.time())
# Async lock to protect counters
self._lock = asyncio.Lock()
async def collect_system_metrics(self) -> SystemMetrics:
ps = self._psutil
now = time.time()
cpu_percent = None
load1 = load5 = load15 = None
mem_total = mem_used = mem_available = mem_percent = None
disk_total = disk_used = disk_free = disk_percent = None
io_read_mb_s = io_write_mb_s = None
net_sent_kb_s = net_recv_kb_s = None
try:
if ps:
# CPU
cpu_percent = float(ps.cpu_percent(interval=None))
try:
load1, load5, load15 = ps.getloadavg() if hasattr(ps, "getloadavg") else os.getloadavg() # type: ignore
except Exception:
load1 = load5 = load15 = None
# Memory
vm = ps.virtual_memory()
mem_total = round(vm.total / (1024 * 1024), 2)
mem_used = round(vm.used / (1024 * 1024), 2)
mem_available = round(vm.available / (1024 * 1024), 2)
mem_percent = float(vm.percent)
# Disk
du = ps.disk_usage("/")
disk_total = round(du.total / (1024 * 1024), 2)
disk_used = round(du.used / (1024 * 1024), 2)
disk_free = round(du.free / (1024 * 1024), 2)
disk_percent = float(du.percent)
# IO rates
try:
dio = ps.disk_io_counters()
if dio and self._last_disk_io:
last_read, last_write, last_ts = self._last_disk_io
dt = max(now - last_ts, 1e-6)
io_read_mb_s = round((max(dio.read_bytes - last_read, 0) / (1024 * 1024)) / dt, 3)
io_write_mb_s = round((max(dio.write_bytes - last_write, 0) / (1024 * 1024)) / dt, 3)
self._last_disk_io = (dio.read_bytes, dio.write_bytes, now) if dio else self._last_disk_io
except Exception:
io_read_mb_s = io_write_mb_s = None
# NET rates
try:
nio = ps.net_io_counters()
if nio and self._last_net_io:
last_sent, last_recv, last_ts = self._last_net_io
dt = max(now - last_ts, 1e-6)
net_sent_kb_s = round((max(nio.bytes_sent - last_sent, 0) / 1024) / dt, 3)
net_recv_kb_s = round((max(nio.bytes_recv - last_recv, 0) / 1024) / dt, 3)
self._last_net_io = (nio.bytes_sent, nio.bytes_recv, now) if nio else self._last_net_io
except Exception:
net_sent_kb_s = net_recv_kb_s = None
except Exception as e:
logger.exception("collect_system_metrics error: %s", e)
return SystemMetrics(
cpu_percent=cpu_percent,
cpu_load_avg_1m=load1,
cpu_load_avg_5m=load5,
cpu_load_avg_15m=load15,
mem_total_mb=mem_total,
mem_used_mb=mem_used,
mem_available_mb=mem_available,
mem_percent=mem_percent,
disk_total_mb=disk_total,
disk_used_mb=disk_used,
disk_free_mb=disk_free,
disk_percent=disk_percent,
io_read_mb_s=io_read_mb_s,
io_write_mb_s=io_write_mb_s,
net_sent_kb_s=net_sent_kb_s,
net_recv_kb_s=net_recv_kb_s,
uptime_seconds=int(time.time()) - self._start_ts,
)
async def collect_app_metrics(self) -> AppMetrics:
# Снимок текущих счетчиков; агрегирование распределено в StatsAggregator
async with self._lock:
return AppMetrics(
total_conversions=self._total_conversions,
total_requests=self._total_requests,
total_errors=self._total_errors,
slow_ops_count=self._slow_ops_count,
avg_response_ms=self._avg_response_ms,
p95_response_ms=self._p95_response_ms,
p99_response_ms=self._p99_response_ms,
details={}, # можно расширить деталями модулей
)
async def get_current_stats(self) -> Tuple[SystemMetrics, AppMetrics]:
sysm = await self.collect_system_metrics()
appm = await self.collect_app_metrics()
return sysm, appm
# Hooks to update app metrics
async def inc_conversions(self, n: int = 1) -> None:
async with self._lock:
self._total_conversions += n
async def inc_requests(self, n: int = 1) -> None:
async with self._lock:
self._total_requests += n
async def inc_errors(self, n: int = 1) -> None:
async with self._lock:
self._total_errors += n
async def inc_slow_ops(self, n: int = 1) -> None:
async with self._lock:
self._slow_ops_count += n
async def observe_latency_ms(self, value_ms: float) -> None:
"""
Простая статистика латентности:
- EMA для avg
- аппроксимация p95/p99 по взвешенному максимуму (упрощённо, без HDR Histogram)
"""
async with self._lock:
alpha = 0.1
if self._avg_response_ms is None:
self._avg_response_ms = value_ms
else:
self._avg_response_ms = (1 - alpha) * self._avg_response_ms + alpha * value_ms
# Простая аппроксимация квантили при помощи EMA "максимума"
def ema_max(current: Optional[float], x: float, beta: float) -> float:
return x if current is None else max((1 - beta) * current, x)
self._p95_response_ms = ema_max(self._p95_response_ms, value_ms, beta=0.05)
self._p99_response_ms = ema_max(self._p99_response_ms, value_ms, beta=0.01)