from __future__ import annotations

import asyncio
import logging
import os
import time
from typing import Optional, Tuple

from app.core.models.stats.metrics_models import SystemMetrics, AppMetrics

logger = logging.getLogger(__name__)


def _try_import_psutil():
    try:
        import psutil  # type: ignore

        return psutil
    except Exception as e:
        logger.warning("psutil not available, system metrics will be limited: %s", e)
        return None


class MetricsCollector:
    """
    Collector of internal metrics:
      - System: CPU, RAM, Disk, IO, Network
      - App: conversions, requests, errors, slow ops, latency

    Keeps only the counters of the current session (history aggregation
    is performed by StatsAggregator).
    """

    def __init__(self) -> None:
        self._psutil = _try_import_psutil()

        # App counters
        self._total_conversions = 0
        self._total_requests = 0
        self._total_errors = 0
        self._slow_ops_count = 0

        # Latency rolling values (exponentially smoothed; p95/p99 are simplified approximations)
        self._avg_response_ms: Optional[float] = None
        self._p95_response_ms: Optional[float] = None
        self._p99_response_ms: Optional[float] = None

        # Previous snapshots for rate calculations
        self._last_disk_io: Optional[Tuple[int, int, float]] = None  # (read_bytes, write_bytes, ts)
        self._last_net_io: Optional[Tuple[int, int, float]] = None  # (bytes_sent, bytes_recv, ts)

        # Uptime
        try:
            self._start_ts = int(os.getenv("NODE_START_TS", str(int(time.time()))))
        except Exception:
            self._start_ts = int(time.time())

        # Async lock to protect counters
        self._lock = asyncio.Lock()

    async def collect_system_metrics(self) -> SystemMetrics:
        ps = self._psutil
        now = time.time()

        cpu_percent = None
        load1 = load5 = load15 = None
        mem_total = mem_used = mem_available = mem_percent = None
        disk_total = disk_used = disk_free = disk_percent = None
        io_read_mb_s = io_write_mb_s = None
        net_sent_kb_s = net_recv_kb_s = None

        try:
            if ps:
                # CPU
                cpu_percent = float(ps.cpu_percent(interval=None))
                try:
                    load1, load5, load15 = (
                        ps.getloadavg() if hasattr(ps, "getloadavg") else os.getloadavg()  # type: ignore
                    )
                except Exception:
                    load1 = load5 = load15 = None

                # Memory
                vm = ps.virtual_memory()
                mem_total = round(vm.total / (1024 * 1024), 2)
                mem_used = round(vm.used / (1024 * 1024), 2)
                mem_available = round(vm.available / (1024 * 1024), 2)
                mem_percent = float(vm.percent)

                # Disk
                du = ps.disk_usage("/")
                disk_total = round(du.total / (1024 * 1024), 2)
                disk_used = round(du.used / (1024 * 1024), 2)
                disk_free = round(du.free / (1024 * 1024), 2)
                disk_percent = float(du.percent)

                # IO rates (need a previous snapshot; the first call only seeds it)
                try:
                    dio = ps.disk_io_counters()
                    if dio and self._last_disk_io:
                        last_read, last_write, last_ts = self._last_disk_io
                        dt = max(now - last_ts, 1e-6)
                        io_read_mb_s = round((max(dio.read_bytes - last_read, 0) / (1024 * 1024)) / dt, 3)
                        io_write_mb_s = round((max(dio.write_bytes - last_write, 0) / (1024 * 1024)) / dt, 3)
                    self._last_disk_io = (dio.read_bytes, dio.write_bytes, now) if dio else self._last_disk_io
                except Exception:
                    io_read_mb_s = io_write_mb_s = None

                # NET rates (same previous-snapshot scheme as disk IO)
                try:
                    nio = ps.net_io_counters()
                    if nio and self._last_net_io:
                        last_sent, last_recv, last_ts = self._last_net_io
                        dt = max(now - last_ts, 1e-6)
                        net_sent_kb_s = round((max(nio.bytes_sent - last_sent, 0) / 1024) / dt, 3)
                        net_recv_kb_s = round((max(nio.bytes_recv - last_recv, 0) / 1024) / dt, 3)
                    self._last_net_io = (nio.bytes_sent, nio.bytes_recv, now) if nio else self._last_net_io
                except Exception:
                    net_sent_kb_s = net_recv_kb_s = None
        except Exception as e:
            logger.exception("collect_system_metrics error: %s", e)

        return SystemMetrics(
            cpu_percent=cpu_percent,
            cpu_load_avg_1m=load1,
            cpu_load_avg_5m=load5,
            cpu_load_avg_15m=load15,
            mem_total_mb=mem_total,
            mem_used_mb=mem_used,
            mem_available_mb=mem_available,
            mem_percent=mem_percent,
            disk_total_mb=disk_total,
            disk_used_mb=disk_used,
            disk_free_mb=disk_free,
            disk_percent=disk_percent,
            io_read_mb_s=io_read_mb_s,
            io_write_mb_s=io_write_mb_s,
            net_sent_kb_s=net_sent_kb_s,
            net_recv_kb_s=net_recv_kb_s,
            uptime_seconds=int(time.time()) - self._start_ts,
        )

    async def collect_app_metrics(self) -> AppMetrics:
        # Snapshot of the current counters; aggregation over time is done in StatsAggregator
        async with self._lock:
            return AppMetrics(
                total_conversions=self._total_conversions,
                total_requests=self._total_requests,
                total_errors=self._total_errors,
                slow_ops_count=self._slow_ops_count,
                avg_response_ms=self._avg_response_ms,
                p95_response_ms=self._p95_response_ms,
                p99_response_ms=self._p99_response_ms,
                details={},  # can be extended with per-module details
            )

    async def get_current_stats(self) -> Tuple[SystemMetrics, AppMetrics]:
        sysm = await self.collect_system_metrics()
        appm = await self.collect_app_metrics()
        return sysm, appm

    # Hooks to update app metrics
    async def inc_conversions(self, n: int = 1) -> None:
        async with self._lock:
            self._total_conversions += n

    async def inc_requests(self, n: int = 1) -> None:
        async with self._lock:
            self._total_requests += n

    async def inc_errors(self, n: int = 1) -> None:
        async with self._lock:
            self._total_errors += n

    async def inc_slow_ops(self, n: int = 1) -> None:
        async with self._lock:
            self._slow_ops_count += n

    async def observe_latency_ms(self, value_ms: float) -> None:
        """
        Lightweight latency statistics:
          - EMA for the average
          - p95/p99 approximated via a decaying maximum (simplified, no HDR Histogram)
        """
        async with self._lock:
            alpha = 0.1
            if self._avg_response_ms is None:
                self._avg_response_ms = value_ms
            else:
                self._avg_response_ms = (1 - alpha) * self._avg_response_ms + alpha * value_ms

            # Rough quantile approximation using an exponentially decayed "maximum"
            def ema_max(current: Optional[float], x: float, beta: float) -> float:
                return x if current is None else max((1 - beta) * current, x)

            self._p95_response_ms = ema_max(self._p95_response_ms, value_ms, beta=0.05)
            self._p99_response_ms = ema_max(self._p99_response_ms, value_ms, beta=0.01)
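

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the original module).
# Assumptions: the module is importable on its own, Python 3.7+ for
# asyncio.run, and a ~5 s pause between system snapshots is enough to get
# meaningful IO/network rates (the first snapshot only seeds the counters).
# ---------------------------------------------------------------------------
if __name__ == "__main__":

    async def _demo() -> None:
        collector = MetricsCollector()

        # Simulate some application activity.
        await collector.inc_requests()
        await collector.inc_conversions()
        await collector.observe_latency_ms(42.0)

        # First snapshot seeds disk/network counters; rates show up from the
        # second snapshot onward.
        await collector.collect_system_metrics()
        await asyncio.sleep(5)

        sysm, appm = await collector.get_current_stats()
        print(sysm)
        print(appm)

    asyncio.run(_demo())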