feat(alerts): add traffic spike detection with configurable thresholds

Introduce a traffic_spike alert type that monitors system-wide and per-master
traffic levels against configurable thresholds stored in the database.

- Add AlertThresholdConfig model for persistent threshold configuration
- Implement GET/PUT /admin/alerts/thresholds endpoints for threshold management
- Add traffic spike detection in alert detector cron job:
  - Global QPS monitoring across all masters
  - Per-master RPM/TPM checks with minimum sample thresholds
  - Per-master RPD/TPD checks for daily limits
- Use warning severity at threshold, critical at 2x threshold
- Include metric metadata (value, threshold, window) in alert details
- Update API documentation with new endpoints and alert type
This commit is contained in:
zenfun
2025-12-31 15:56:17 +08:00
parent 85d91cdd2e
commit ba54abd424
6 changed files with 563 additions and 3 deletions

View File

@@ -16,6 +16,7 @@ const (
AlertTypeKeyDisabled AlertType = "key_disabled"
AlertTypeKeyExpired AlertType = "key_expired"
AlertTypeProviderDown AlertType = "provider_down"
AlertTypeTrafficSpike AlertType = "traffic_spike"
)
// AlertSeverity defines the severity level of an alert

View File

@@ -0,0 +1,46 @@
package model
import (
"time"
"gorm.io/gorm"
)
// AlertThresholdConfig stores configurable thresholds for traffic spike detection.
//
// It is a GORM model persisted in the table named by TableName. Every threshold
// column is declared NOT NULL with a database-level default in its gorm tag.
// Those tag defaults carry the same values that DefaultAlertThresholdConfig
// returns, so both places must be updated together when a default changes.
type AlertThresholdConfig struct {
// Record bookkeeping. DeletedAt is indexed and excluded from JSON output
// (json:"-"); gorm.DeletedAt is GORM's soft-delete timestamp type.
ID uint `gorm:"primaryKey" json:"id"`
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
DeletedAt gorm.DeletedAt `gorm:"index" json:"-"`
// Global thresholds
GlobalQPS int64 `gorm:"not null;default:100" json:"global_qps"` // System-wide queries-per-second threshold
// Per-master thresholds
MasterRPM int64 `gorm:"not null;default:20" json:"master_rpm"` // Requests per minute threshold
MasterRPD int64 `gorm:"not null;default:1000" json:"master_rpd"` // Requests per day threshold
MasterTPM int64 `gorm:"not null;default:10000000" json:"master_tpm"` // Tokens per minute threshold
MasterTPD int64 `gorm:"not null;default:100000000" json:"master_tpd"` // Tokens per day threshold
// Minimum sample thresholds for 1-minute window checks.
// NOTE(review): presumably these suppress RPM/TPM alerts when the 1-minute
// sample is too small to be meaningful — confirm against the detector code.
MinRPMRequests1m int64 `gorm:"not null;default:10" json:"min_rpm_requests_1m"` // Min requests in 1m to trigger RPM check
MinTPMTokens1m int64 `gorm:"not null;default:1000000" json:"min_tpm_tokens_1m"` // Min tokens in 1m to trigger TPM check
}
// DefaultAlertThresholdConfig returns the default threshold configuration.
//
// Only the threshold fields are populated; every other field of the returned
// value keeps its zero value.
func DefaultAlertThresholdConfig() AlertThresholdConfig {
	var cfg AlertThresholdConfig

	// System-wide limit.
	cfg.GlobalQPS = 100

	// Per-master request limits: per minute, then per day.
	cfg.MasterRPM = 20
	cfg.MasterRPD = 1000

	// Per-master token limits: per minute, then per day.
	cfg.MasterTPM = 10000000
	cfg.MasterTPD = 100000000

	// Minimum 1-minute sample sizes for the RPM and TPM checks.
	cfg.MinRPMRequests1m = 10
	cfg.MinTPMTokens1m = 1000000

	return cfg
}
// TableName returns the name of the database table that stores
// AlertThresholdConfig rows.
func (AlertThresholdConfig) TableName() string {
	const tableName = "alert_threshold_configs"
	return tableName
}