---
# Alert rules for my-uploader-bot monitoring.
#
# Prometheus alerting-rule file. Each group below is evaluated on its own
# `interval`; an alert only fires after its `expr` has held for the `for`
# duration. Every rule carries a `severity` label (warning / critical) and a
# `service` label naming the component it belongs to.
#
# Unit conventions used in the annotations:
#   - `humanizePercentage` expects a 0-1 ratio, so it is only applied to
#     expressions that yield a ratio.
#   - `rate()` yields a per-second value, so rate-based annotations say "/s".
groups:
  # Application process health: availability, HTTP errors/latency, and the
  # process' own memory and CPU consumption.
  - name: application_alerts
    interval: 30s
    rules:
      # Application availability
      - alert: ApplicationDown
        expr: up{job="my-uploader-bot"} == 0
        for: 1m
        labels:
          severity: critical
          service: my-uploader-bot
        annotations:
          summary: "Application instance {{ $labels.instance }} is down"
          description: "My-uploader-bot application has been down for more than 1 minute"

      # High error rate
      # Both sides are aggregated to the same label set (`job`). Without the
      # aggregation the division matches each 5xx series only against itself,
      # so the ratio is 1 whenever any 5xx traffic exists.
      - alert: HighErrorRate
        expr: >-
          sum by (job) (rate(http_requests_total{status=~"5.."}[5m]))
          / sum by (job) (rate(http_requests_total[5m]))
          > 0.1
        for: 5m
        labels:
          severity: warning
          service: my-uploader-bot
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }} for the last 5 minutes"

      # High response time (95th percentile above 2 seconds)
      - alert: HighResponseTime
        expr: >-
          histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))
          > 2.0
        for: 5m
        labels:
          severity: warning
          service: my-uploader-bot
        annotations:
          summary: "High response time detected"
          description: "95th percentile response time is {{ $value }}s for the last 5 minutes"

      # High memory usage (resident memory converted from bytes to GiB)
      - alert: HighMemoryUsage
        expr: (process_resident_memory_bytes / 1024 / 1024 / 1024) > 2.0
        for: 10m
        labels:
          severity: warning
          service: my-uploader-bot
        annotations:
          summary: "High memory usage detected"
          description: "Memory usage is {{ $value | humanize }}GB"

      # High CPU usage
      # The expression is already scaled to 0-100, so the value is printed
      # directly; humanizePercentage would multiply it by 100 a second time.
      - alert: HighCPUUsage
        expr: rate(process_cpu_seconds_total[5m]) * 100 > 80
        for: 10m
        labels:
          severity: warning
          service: my-uploader-bot
        annotations:
          summary: "High CPU usage detected"
          description: 'CPU usage is {{ $value | printf "%.1f" }}%'

  # PostgreSQL health: availability, connection saturation, long
  # transactions and replication lag.
  - name: database_alerts
    interval: 30s
    rules:
      # Database down
      - alert: PostgreSQLDown
        expr: up{job="postgres"} == 0
        for: 1m
        labels:
          severity: critical
          service: postgresql
        annotations:
          summary: "PostgreSQL instance {{ $labels.instance }} is down"
          description: "PostgreSQL database has been down for more than 1 minute"

      # High database connections
      # Backends are summed across databases and both sides are reduced to
      # the `instance` label so the division can match.
      # NOTE(review): assumes numbackends is exported per database while
      # max_connections is exported once per server - confirm against the
      # exporter in use.
      - alert: HighDatabaseConnections
        expr: >-
          sum by (instance) (pg_stat_database_numbackends)
          / max by (instance) (pg_settings_max_connections)
          > 0.8
        for: 5m
        labels:
          severity: warning
          service: postgresql
        annotations:
          summary: "High database connection usage"
          description: "Database connection usage is {{ $value | humanizePercentage }}"

      # Slow queries
      - alert: SlowQueries
        expr: pg_stat_activity_max_tx_duration > 300
        for: 5m
        labels:
          severity: warning
          service: postgresql
        annotations:
          summary: "Slow database queries detected"
          description: "Longest running query is {{ $value }}s"

      # Replication lag
      - alert: ReplicationLag
        expr: pg_replication_lag > 60
        for: 5m
        labels:
          severity: warning
          service: postgresql
        annotations:
          summary: "High replication lag"
          description: "Replication lag is {{ $value }}s"

  # Redis health: availability, memory saturation and cache effectiveness.
  - name: cache_alerts
    interval: 30s
    rules:
      # Redis down
      - alert: RedisDown
        expr: up{job="redis"} == 0
        for: 1m
        labels:
          severity: critical
          service: redis
        annotations:
          summary: "Redis instance {{ $labels.instance }} is down"
          description: "Redis cache has been down for more than 1 minute"

      # High memory usage
      # Series whose limit is 0 are filtered out first; dividing by a zero
      # limit would yield +Inf and keep the alert firing permanently.
      # NOTE(review): a 0 limit presumably means "no maxmemory configured" -
      # confirm against the exporter in use.
      - alert: RedisHighMemoryUsage
        expr: redis_memory_used_bytes / (redis_memory_max_bytes > 0) > 0.9
        for: 5m
        labels:
          severity: warning
          service: redis
        annotations:
          summary: "Redis high memory usage"
          description: "Redis memory usage is {{ $value | humanizePercentage }}"

      # Cache hit rate drop (hits / (hits + misses) below 80%)
      - alert: RedisCacheHitRateDrop
        expr: >-
          rate(redis_keyspace_hits_total[5m])
          / (rate(redis_keyspace_hits_total[5m]) + rate(redis_keyspace_misses_total[5m]))
          < 0.8
        for: 10m
        labels:
          severity: warning
          service: redis
        annotations:
          summary: "Redis cache hit rate dropped"
          description: "Cache hit rate is {{ $value | humanizePercentage }}"

  # Host-level alerts: disk, memory and load.
  - name: system_alerts
    interval: 30s
    rules:
      # High disk usage
      - alert: HighDiskUsage
        expr: >-
          (node_filesystem_size_bytes - node_filesystem_avail_bytes)
          / node_filesystem_size_bytes
          > 0.85
        for: 5m
        labels:
          severity: warning
          service: system
        annotations:
          summary: "High disk usage on {{ $labels.mountpoint }}"
          description: "Disk usage is {{ $value | humanizePercentage }}"

      # High memory usage
      - alert: HighSystemMemoryUsage
        expr: >-
          (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)
          / node_memory_MemTotal_bytes
          > 0.9
        for: 5m
        labels:
          severity: warning
          service: system
        annotations:
          summary: "High system memory usage"
          description: "System memory usage is {{ $value | humanizePercentage }}"

      # High load average
      # The 15-minute load is divided by the number of idle-mode CPU series,
      # so the threshold and the reported value are per CPU core.
      - alert: HighLoadAverage
        expr: >-
          node_load15
          / count without (cpu, mode) (node_cpu_seconds_total{mode="idle"})
          > 2.0
        for: 10m
        labels:
          severity: warning
          service: system
        annotations:
          summary: "High system load"
          description: "15-minute load average per CPU core is {{ $value }}"

  # Upload pipeline and storage capacity.
  - name: storage_alerts
    interval: 60s
    rules:
      # High upload queue
      - alert: HighUploadQueue
        expr: upload_queue_size > 100
        for: 5m
        labels:
          severity: warning
          service: storage
        annotations:
          summary: "High upload queue size"
          description: "Upload queue has {{ $value }} pending items"

      # Failed uploads (rate() is per-second, hence "/s" in the description)
      - alert: HighFailedUploads
        expr: rate(upload_failures_total[10m]) > 0.1
        for: 5m
        labels:
          severity: warning
          service: storage
        annotations:
          summary: "High upload failure rate"
          description: "Upload failure rate is {{ $value }}/s"

      # Storage space (less than 10% available)
      - alert: LowStorageSpace
        expr: storage_available_bytes / storage_total_bytes < 0.1
        for: 5m
        labels:
          severity: critical
          service: storage
        annotations:
          summary: "Low storage space"
          description: "Available storage is {{ $value | humanizePercentage }}"

  # TON blockchain integration.
  - name: blockchain_alerts
    interval: 60s
    rules:
      # TON service down
      - alert: TONServiceDown
        expr: ton_service_up == 0
        for: 2m
        labels:
          severity: critical
          service: blockchain
        annotations:
          summary: "TON service is down"
          description: "TON blockchain service has been unavailable for more than 2 minutes"

      # High transaction failures (rate() is per-second, hence "/s")
      - alert: HighTransactionFailures
        expr: rate(blockchain_transaction_failures_total[10m]) > 0.05
        for: 5m
        labels:
          severity: warning
          service: blockchain
        annotations:
          summary: "High blockchain transaction failure rate"
          description: "Transaction failure rate is {{ $value }}/s"

      # Pending transactions
      - alert: HighPendingTransactions
        expr: blockchain_pending_transactions > 50
        for: 10m
        labels:
          severity: warning
          service: blockchain
        annotations:
          summary: "High number of pending transactions"
          description: "{{ $value }} transactions are pending for more than 10 minutes"

  # Authentication and abuse signals.
  - name: security_alerts
    interval: 30s
    rules:
      # High login failures (rate() is per-second, hence "/s")
      - alert: HighLoginFailures
        expr: rate(auth_login_failures_total[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
          service: security
        annotations:
          summary: "High login failure rate"
          description: "Login failure rate is {{ $value }}/s"

      # Rate limit hits (rate() is per-second, hence "/s")
      - alert: HighRateLimitHits
        expr: rate(rate_limit_hits_total[5m]) > 10
        for: 5m
        labels:
          severity: warning
          service: security
        annotations:
          summary: "High rate limit hits"
          description: "Rate limit hit rate is {{ $value }}/s"

      # Suspicious activity
      - alert: SuspiciousActivity
        expr: security_suspicious_events > 5
        for: 1m
        labels:
          severity: critical
          service: security
        annotations:
          summary: "Suspicious security activity detected"
          description: "{{ $value }} suspicious events detected in the last minute"