# Alert rules for my-uploader-bot monitoring
|
|
|
|
groups:
|
|
# Application-level alerts for the my-uploader-bot service: availability,
# HTTP error ratio, latency, and process memory/CPU. Evaluated every 30s.
- name: application_alerts
  interval: 30s
  rules:
  # Application availability: the scrape target reports up == 0.
  - alert: ApplicationDown
    expr: up{job="my-uploader-bot"} == 0
    for: 1m
    labels:
      severity: critical
      service: my-uploader-bot
    annotations:
      summary: "Application instance {{ $labels.instance }} is down"
      description: "My-uploader-bot application has been down for more than 1 minute"

  # High error rate: share of 5xx responses among all responses.
  # Both sides are aggregated with sum by (job) so the division matches on the
  # same label set. Without aggregation each 5xx series is only ever divided by
  # itself (one-to-one matching on identical labels), which always yields 1.
  - alert: HighErrorRate
    expr: >-
      sum by (job) (rate(http_requests_total{status=~"5.."}[5m]))
      / sum by (job) (rate(http_requests_total[5m]))
      > 0.1
    for: 5m
    labels:
      severity: warning
      service: my-uploader-bot
    annotations:
      summary: "High error rate detected"
      description: "Error rate is {{ $value | humanizePercentage }} for the last 5 minutes"

  # High response time: 95th percentile of request duration above 2 seconds.
  - alert: HighResponseTime
    expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2.0
    for: 5m
    labels:
      severity: warning
      service: my-uploader-bot
    annotations:
      summary: "High response time detected"
      description: "95th percentile response time is {{ $value }}s for the last 5 minutes"

  # High memory usage: resident memory above 2 GiB.
  # The bytes value is divided by 1024^3, so the unit shown is GiB.
  - alert: HighMemoryUsage
    expr: (process_resident_memory_bytes / 1024 / 1024 / 1024) > 2.0
    for: 10m
    labels:
      severity: warning
      service: my-uploader-bot
    annotations:
      summary: "High memory usage detected"
      description: "Memory usage is {{ $value | humanize }}GiB"

  # High CPU usage: process consumes more than 80% of one core.
  # The expression is kept as a 0-1 ratio (0.8 is the same threshold as the
  # former "* 100 > 80") because humanizePercentage multiplies the value by 100
  # itself; feeding it an already-scaled value would print e.g. "8500%".
  - alert: HighCPUUsage
    expr: rate(process_cpu_seconds_total[5m]) > 0.8
    for: 10m
    labels:
      severity: warning
      service: my-uploader-bot
    annotations:
      summary: "High CPU usage detected"
      description: "CPU usage is {{ $value | humanizePercentage }}"
|
|
|
|
# PostgreSQL alerts: availability, connection saturation, long transactions,
# and replication lag. Evaluated every 30s.
- name: database_alerts
  interval: 30s
  rules:
  # Database down: the postgres scrape target reports up == 0.
  - alert: PostgreSQLDown
    expr: up{job="postgres"} == 0
    for: 1m
    labels:
      severity: critical
      service: postgresql
    annotations:
      summary: "PostgreSQL instance {{ $labels.instance }} is down"
      description: "PostgreSQL database has been down for more than 1 minute"

  # High database connections: more than 80% of max_connections in use.
  # Backends are summed across databases and both sides are reduced to the
  # instance label so the division can match. Dividing the raw per-database
  # series by the server-wide setting finds no matching label sets and the
  # alert would never fire.
  # NOTE(review): assumes both metrics carry an `instance` label, as with the
  # usual postgres_exporter setup; confirm against the actual exporter.
  - alert: HighDatabaseConnections
    expr: >-
      sum by (instance) (pg_stat_database_numbackends)
      / max by (instance) (pg_settings_max_connections)
      > 0.8
    for: 5m
    labels:
      severity: warning
      service: postgresql
    annotations:
      summary: "High database connection usage"
      description: "Database connection usage is {{ $value | humanizePercentage }}"

  # Slow queries: longest transaction duration above 300 seconds.
  - alert: SlowQueries
    expr: pg_stat_activity_max_tx_duration > 300
    for: 5m
    labels:
      severity: warning
      service: postgresql
    annotations:
      summary: "Slow database queries detected"
      description: "Longest running query is {{ $value }}s"

  # Replication lag above 60 seconds.
  - alert: ReplicationLag
    expr: pg_replication_lag > 60
    for: 5m
    labels:
      severity: warning
      service: postgresql
    annotations:
      summary: "High replication lag"
      description: "Replication lag is {{ $value }}s"
|
|
|
|
# Redis alerts: availability, memory saturation, and cache hit ratio.
# Evaluated every 30s.
- name: cache_alerts
  interval: 30s
  rules:
  # Redis down: the redis scrape target reports up == 0.
  - alert: RedisDown
    expr: up{job="redis"} == 0
    for: 1m
    labels:
      severity: critical
      service: redis
    annotations:
      summary: "Redis instance {{ $labels.instance }} is down"
      description: "Redis cache has been down for more than 1 minute"

  # High memory usage: more than 90% of the configured maxmemory in use.
  # The "and ... > 0" guard skips instances where redis_memory_max_bytes is 0;
  # dividing by 0 yields +Inf, which would otherwise satisfy "> 0.9" forever.
  # NOTE(review): a value of 0 presumably means no maxmemory limit is set;
  # confirm against the exporter in use.
  - alert: RedisHighMemoryUsage
    expr: >-
      redis_memory_used_bytes / redis_memory_max_bytes > 0.9
      and on (instance) redis_memory_max_bytes > 0
    for: 5m
    labels:
      severity: warning
      service: redis
    annotations:
      summary: "Redis high memory usage"
      description: "Redis memory usage is {{ $value | humanizePercentage }}"

  # Cache hit rate drop: hits / (hits + misses) below 80% for 10 minutes.
  - alert: RedisCacheHitRateDrop
    expr: >-
      rate(redis_keyspace_hits_total[5m])
      / (rate(redis_keyspace_hits_total[5m]) + rate(redis_keyspace_misses_total[5m]))
      < 0.8
    for: 10m
    labels:
      severity: warning
      service: redis
    annotations:
      summary: "Redis cache hit rate dropped"
      description: "Cache hit rate is {{ $value | humanizePercentage }}"
|
|
|
|
# Host-level alerts: disk usage, memory usage, and CPU load.
# Evaluated every 30s.
- name: system_alerts
  interval: 30s
  rules:
  # High disk usage: more than 85% of a filesystem is used.
  # Read-only image filesystems (squashfs, iso9660) are excluded: they always
  # report 100% used and would keep this alert firing permanently.
  - alert: HighDiskUsage
    expr: >-
      (node_filesystem_size_bytes{fstype!~"squashfs|iso9660"}
      - node_filesystem_avail_bytes{fstype!~"squashfs|iso9660"})
      / node_filesystem_size_bytes{fstype!~"squashfs|iso9660"}
      > 0.85
    for: 5m
    labels:
      severity: warning
      service: system
    annotations:
      summary: "High disk usage on {{ $labels.mountpoint }}"
      description: "Disk usage is {{ $value | humanizePercentage }}"

  # High memory usage: less than 10% of total memory is available.
  - alert: HighSystemMemoryUsage
    expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.9
    for: 5m
    labels:
      severity: warning
      service: system
    annotations:
      summary: "High system memory usage"
      description: "System memory usage is {{ $value | humanizePercentage }}"

  # High load average: 15-minute load divided by the number of CPUs (counted
  # from the idle-mode CPU series) exceeds 2.0. The alert value is therefore
  # load per CPU core, and the description says so.
  - alert: HighLoadAverage
    expr: node_load15 / count without (cpu, mode) (node_cpu_seconds_total{mode="idle"}) > 2.0
    for: 10m
    labels:
      severity: warning
      service: system
    annotations:
      summary: "High system load"
      description: "15-minute load average per CPU core is {{ $value }}"
|
|
|
|
# Upload/storage alerts: queue backlog, upload failures, and free space.
# Evaluated every 60s.
- name: storage_alerts
  interval: 60s
  rules:
  # High upload queue: more than 100 items waiting.
  - alert: HighUploadQueue
    expr: upload_queue_size > 100
    for: 5m
    labels:
      severity: warning
      service: storage
    annotations:
      summary: "High upload queue size"
      description: "Upload queue has {{ $value }} pending items"

  # Failed uploads: failure rate above 0.1 per second over a 10-minute window.
  # rate() always returns a per-second value, so the description reports "/s".
  - alert: HighFailedUploads
    expr: rate(upload_failures_total[10m]) > 0.1
    for: 5m
    labels:
      severity: warning
      service: storage
    annotations:
      summary: "High upload failure rate"
      description: "Upload failure rate is {{ $value }}/s"

  # Storage space: less than 10% of total storage is available.
  - alert: LowStorageSpace
    expr: storage_available_bytes / storage_total_bytes < 0.1
    for: 5m
    labels:
      severity: critical
      service: storage
    annotations:
      summary: "Low storage space"
      description: "Available storage is {{ $value | humanizePercentage }}"
|
|
|
|
# Blockchain (TON) alerts: service availability, transaction failures, and
# pending-transaction backlog. Evaluated every 60s.
- name: blockchain_alerts
  interval: 60s
  rules:
  # TON service down: ton_service_up reports 0.
  - alert: TONServiceDown
    expr: ton_service_up == 0
    for: 2m
    labels:
      severity: critical
      service: blockchain
    annotations:
      summary: "TON service is down"
      description: "TON blockchain service has been unavailable for more than 2 minutes"

  # High transaction failures: failure rate above 0.05 per second over a
  # 10-minute window. rate() always returns a per-second value, so the
  # description reports "/s".
  - alert: HighTransactionFailures
    expr: rate(blockchain_transaction_failures_total[10m]) > 0.05
    for: 5m
    labels:
      severity: warning
      service: blockchain
    annotations:
      summary: "High blockchain transaction failure rate"
      description: "Transaction failure rate is {{ $value }}/s"

  # Pending transactions: more than 50 pending for at least 10 minutes.
  - alert: HighPendingTransactions
    expr: blockchain_pending_transactions > 50
    for: 10m
    labels:
      severity: warning
      service: blockchain
    annotations:
      summary: "High number of pending transactions"
      description: "{{ $value }} transactions are pending for more than 10 minutes"
|
|
|
|
# Security alerts: login failures, rate-limit hits, and suspicious events.
# Evaluated every 30s.
- name: security_alerts
  interval: 30s
  rules:
  # High login failures: failure rate above 0.1 per second.
  # rate() always returns a per-second value, so the description reports "/s".
  - alert: HighLoginFailures
    expr: rate(auth_login_failures_total[5m]) > 0.1
    for: 5m
    labels:
      severity: warning
      service: security
    annotations:
      summary: "High login failure rate"
      description: "Login failure rate is {{ $value }}/s"

  # Rate limit hits: more than 10 hits per second.
  # rate() always returns a per-second value, so the description reports "/s".
  - alert: HighRateLimitHits
    expr: rate(rate_limit_hits_total[5m]) > 10
    for: 5m
    labels:
      severity: warning
      service: security
    annotations:
      summary: "High rate limit hits"
      description: "Rate limit hit rate is {{ $value }}/s"

  # Suspicious activity: security_suspicious_events above 5 for 1 minute.
  # NOTE(review): the description says "in the last minute", which presumes
  # the metric already counts events over a one-minute window; confirm.
  - alert: SuspiciousActivity
    expr: security_suspicious_events > 5
    for: 1m
    labels:
      severity: critical
      service: security
    annotations:
      summary: "Suspicious security activity detected"
      description: "{{ $value }} suspicious events detected in the last minute"