194 lines
7.7 KiB
Python
194 lines
7.7 KiB
Python
from __future__ import annotations

import asyncio
import logging
import math
import os
import time
from typing import Optional, Tuple

from app.core.models.stats.metrics_models import SystemMetrics, AppMetrics
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def _try_import_psutil():
    """Return the ``psutil`` module, or ``None`` when importing it fails.

    Any exception raised by the import is logged as a warning instead of
    being propagated, so callers can carry on without psutil.
    """
    try:
        import psutil as module  # type: ignore
    except Exception as exc:
        logger.warning("psutil not available, system metrics will be limited: %s", exc)
        module = None
    return module
|
|
|
|
|
|
class MetricsCollector:
    """Collect in-process system and application metrics.

    - System: CPU, RAM, disk usage, disk I/O rates, network rates.
    - App: conversions, requests, errors, slow operations, latency.

    Only the counters of the current session are stored here; according to
    the original design note, history aggregation is done by
    ``StatsAggregator``.
    """

    # Unit divisors used when converting raw byte counts.
    _BYTES_PER_KIB = 1024
    _BYTES_PER_MIB = 1024 * 1024
    # Lower bound for the elapsed time between two snapshots, so a rate is
    # never computed with a zero or negative divisor.
    _MIN_INTERVAL_S = 1e-6

    # Smoothing factor of the exponential moving average of latency.
    _AVG_ALPHA = 0.1
    # Per-sample decay of the "decaying maximum" used as a rough p95 / p99.
    _P95_DECAY = 0.05
    _P99_DECAY = 0.01

    def __init__(self) -> None:
        """Initialise counters, rate snapshots, start timestamp and lock."""
        self._psutil = _try_import_psutil()

        # App counters
        self._total_conversions = 0
        self._total_requests = 0
        self._total_errors = 0
        self._slow_ops_count = 0

        # Rolling latency values: EMA for the average, decaying maximum as a
        # simplified stand-in for p95 / p99.
        self._avg_response_ms: Optional[float] = None
        self._p95_response_ms: Optional[float] = None
        self._p99_response_ms: Optional[float] = None

        # Previous snapshots for rate calculations
        self._last_disk_io: Optional[Tuple[int, int, float]] = None  # (read_bytes, write_bytes, ts)
        self._last_net_io: Optional[Tuple[int, int, float]] = None  # (bytes_sent, bytes_recv, ts)

        # Uptime reference: NODE_START_TS when it holds an integer, otherwise
        # the moment this collector was created.
        now_ts = int(time.time())
        try:
            self._start_ts = int(os.getenv("NODE_START_TS", str(now_ts)))
        except ValueError:
            self._start_ts = now_ts

        # Async lock to protect counters
        self._lock = asyncio.Lock()

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _guarded(reader, default):
        """Call ``reader()``; on any exception log it and return ``default``.

        Keeps one failing psutil reading from discarding the other sections.
        """
        try:
            return reader()
        except Exception as e:
            logger.exception("collect_system_metrics error: %s", e)
            return default

    @classmethod
    def _to_mib(cls, num_bytes: float) -> float:
        """Convert a byte count to mebibytes, rounded to 2 decimals."""
        return round(num_bytes / cls._BYTES_PER_MIB, 2)

    @classmethod
    def _rate_per_second(
        cls, current: int, previous: int, elapsed_s: float, unit_bytes: int
    ) -> float:
        """Return the per-second growth of a byte counter in ``unit_bytes`` units.

        A counter that went backwards yields 0, and the elapsed time is
        clamped to ``_MIN_INTERVAL_S``. The result is rounded to 3 decimals.
        """
        delta = max(current - previous, 0)
        interval = max(elapsed_s, cls._MIN_INTERVAL_S)
        return round((delta / unit_bytes) / interval, 3)

    @classmethod
    def _rates_since(
        cls,
        previous: Optional[Tuple[int, int, float]],
        first: int,
        second: int,
        now: float,
        unit_bytes: int,
    ) -> Tuple[Optional[float], Optional[float]]:
        """Return the rates of two counters relative to ``previous``.

        ``previous`` is ``(first_counter, second_counter, timestamp)``.
        Returns ``(None, None)`` when there is no previous snapshot yet.
        """
        if not previous:
            return None, None
        prev_first, prev_second, prev_ts = previous
        elapsed = now - prev_ts
        return (
            cls._rate_per_second(first, prev_first, elapsed, unit_bytes),
            cls._rate_per_second(second, prev_second, elapsed, unit_bytes),
        )

    @staticmethod
    def _read_load_avg(ps) -> Tuple[Optional[float], Optional[float], Optional[float]]:
        """Return the 1/5/15-minute load averages, or ``None`` values.

        Uses ``ps.getloadavg`` when ``ps`` provides it and ``os.getloadavg``
        otherwise, so the reading also works when psutil is not installed.
        Failures are deliberately silent because this is best-effort.
        """
        try:
            getter = ps.getloadavg if hasattr(ps, "getloadavg") else os.getloadavg  # type: ignore
            one, five, fifteen = getter()
            return one, five, fifteen
        except Exception:
            return None, None, None

    @classmethod
    def _read_memory(cls, ps) -> Tuple[float, float, float, float]:
        """Return (total MiB, used MiB, available MiB, percent) of memory."""
        vm = ps.virtual_memory()
        return (
            cls._to_mib(vm.total),
            cls._to_mib(vm.used),
            cls._to_mib(vm.available),
            float(vm.percent),
        )

    @classmethod
    def _read_disk(cls, ps) -> Tuple[float, float, float, float]:
        """Return (total MiB, used MiB, free MiB, percent) for path ``/``."""
        du = ps.disk_usage("/")
        return (
            cls._to_mib(du.total),
            cls._to_mib(du.used),
            cls._to_mib(du.free),
            float(du.percent),
        )

    def _disk_io_rates(self, ps, now: float) -> Tuple[Optional[float], Optional[float]]:
        """Return (read MiB/s, write MiB/s) since the last call.

        Also stores the current counters as the new snapshot. Returns
        ``(None, None)`` on the first call, when no counters are reported,
        or when reading them fails.
        """
        try:
            counters = ps.disk_io_counters()
            if not counters:
                return None, None
            rates = self._rates_since(
                self._last_disk_io,
                counters.read_bytes,
                counters.write_bytes,
                now,
                self._BYTES_PER_MIB,
            )
            self._last_disk_io = (counters.read_bytes, counters.write_bytes, now)
            return rates
        except Exception:
            logger.debug("disk I/O rate collection failed", exc_info=True)
            return None, None

    def _net_io_rates(self, ps, now: float) -> Tuple[Optional[float], Optional[float]]:
        """Return (sent KiB/s, received KiB/s) since the last call.

        Also stores the current counters as the new snapshot. Returns
        ``(None, None)`` on the first call, when no counters are reported,
        or when reading them fails.
        """
        try:
            counters = ps.net_io_counters()
            if not counters:
                return None, None
            rates = self._rates_since(
                self._last_net_io,
                counters.bytes_sent,
                counters.bytes_recv,
                now,
                self._BYTES_PER_KIB,
            )
            self._last_net_io = (counters.bytes_sent, counters.bytes_recv, now)
            return rates
        except Exception:
            logger.debug("network rate collection failed", exc_info=True)
            return None, None

    @staticmethod
    def _decaying_max(current: Optional[float], x: float, beta: float) -> float:
        """Return ``x`` for the first sample, else ``max((1 - beta) * current, x)``."""
        return x if current is None else max((1 - beta) * current, x)

    # ------------------------------------------------------------------
    # Collection
    # ------------------------------------------------------------------

    async def collect_system_metrics(self) -> "SystemMetrics":
        """Build a ``SystemMetrics`` snapshot of the host.

        Every reading is optional: a value that cannot be obtained is
        reported as ``None``. Each section (CPU, memory, disk, disk I/O,
        network) is read independently, so a failure in one does not
        discard the others. Rates are ``None`` on the first call because
        they need a previous snapshot.
        """
        ps = self._psutil
        now = time.time()

        cpu_percent = None
        mem_total = mem_used = mem_available = mem_percent = None
        disk_total = disk_used = disk_free = disk_percent = None
        io_read_mb_s = io_write_mb_s = None
        net_sent_kb_s = net_recv_kb_s = None

        # Load average does not require psutil.
        load1, load5, load15 = self._read_load_avg(ps)

        if ps:
            # NOTE(review): per psutil docs the first cpu_percent(interval=None)
            # call has no baseline and returns a meaningless 0.0 - confirm
            # consumers tolerate that.
            cpu_percent = self._guarded(
                lambda: float(ps.cpu_percent(interval=None)), None
            )
            mem_total, mem_used, mem_available, mem_percent = self._guarded(
                lambda: self._read_memory(ps), (None, None, None, None)
            )
            disk_total, disk_used, disk_free, disk_percent = self._guarded(
                lambda: self._read_disk(ps), (None, None, None, None)
            )
            io_read_mb_s, io_write_mb_s = self._disk_io_rates(ps, now)
            net_sent_kb_s, net_recv_kb_s = self._net_io_rates(ps, now)

        return SystemMetrics(
            cpu_percent=cpu_percent,
            cpu_load_avg_1m=load1,
            cpu_load_avg_5m=load5,
            cpu_load_avg_15m=load15,
            mem_total_mb=mem_total,
            mem_used_mb=mem_used,
            mem_available_mb=mem_available,
            mem_percent=mem_percent,
            disk_total_mb=disk_total,
            disk_used_mb=disk_used,
            disk_free_mb=disk_free,
            disk_percent=disk_percent,
            io_read_mb_s=io_read_mb_s,
            io_write_mb_s=io_write_mb_s,
            net_sent_kb_s=net_sent_kb_s,
            net_recv_kb_s=net_recv_kb_s,
            uptime_seconds=int(time.time()) - self._start_ts,
        )

    async def collect_app_metrics(self) -> "AppMetrics":
        """Return an ``AppMetrics`` snapshot of the current counters.

        The snapshot is taken under the lock so all fields belong to the
        same moment.
        """
        async with self._lock:
            return AppMetrics(
                total_conversions=self._total_conversions,
                total_requests=self._total_requests,
                total_errors=self._total_errors,
                slow_ops_count=self._slow_ops_count,
                avg_response_ms=self._avg_response_ms,
                p95_response_ms=self._p95_response_ms,
                p99_response_ms=self._p99_response_ms,
                details={},  # can be extended with per-module details
            )

    async def get_current_stats(self) -> "Tuple[SystemMetrics, AppMetrics]":
        """Return ``(system_metrics, app_metrics)`` collected in that order."""
        sysm = await self.collect_system_metrics()
        appm = await self.collect_app_metrics()
        return sysm, appm

    # ------------------------------------------------------------------
    # Hooks to update app metrics
    # ------------------------------------------------------------------

    async def inc_conversions(self, n: int = 1) -> None:
        """Add ``n`` to the conversions counter."""
        async with self._lock:
            self._total_conversions += n

    async def inc_requests(self, n: int = 1) -> None:
        """Add ``n`` to the requests counter."""
        async with self._lock:
            self._total_requests += n

    async def inc_errors(self, n: int = 1) -> None:
        """Add ``n`` to the errors counter."""
        async with self._lock:
            self._total_errors += n

    async def inc_slow_ops(self, n: int = 1) -> None:
        """Add ``n`` to the slow-operations counter."""
        async with self._lock:
            self._slow_ops_count += n

    async def observe_latency_ms(self, value_ms: float) -> None:
        """Record one latency sample, in milliseconds.

        - The average is an exponential moving average.
        - p95 / p99 are approximated by a decaying maximum (a simplification,
          not a real quantile estimate such as an HDR histogram).

        Non-finite samples (NaN, +/-inf) are ignored: a single one would
        otherwise corrupt the moving average for good.
        """
        if not math.isfinite(value_ms):
            return

        async with self._lock:
            if self._avg_response_ms is None:
                self._avg_response_ms = value_ms
            else:
                alpha = self._AVG_ALPHA
                self._avg_response_ms = (1 - alpha) * self._avg_response_ms + alpha * value_ms

            self._p95_response_ms = self._decaying_max(
                self._p95_response_ms, value_ms, beta=self._P95_DECAY
            )
            self._p99_response_ms = self._decaying_max(
                self._p99_response_ms, value_ms, beta=self._P99_DECAY
            )