diff --git a/cmd/server/main.go b/cmd/server/main.go
index 92000d4..c734208 100644
--- a/cmd/server/main.go
+++ b/cmd/server/main.go
@@ -143,7 +143,7 @@ func main() {
 
 	// Auto Migrate
 	if logDB != db {
-		if err := db.AutoMigrate(&model.Master{}, &model.Key{}, &model.ProviderGroup{}, &model.APIKey{}, &model.Model{}, &model.Binding{}, &model.Namespace{}, &model.OperationLog{}, &model.SyncOutbox{}, &model.Alert{}); err != nil {
+		if err := db.AutoMigrate(&model.Master{}, &model.Key{}, &model.ProviderGroup{}, &model.APIKey{}, &model.Model{}, &model.Binding{}, &model.Namespace{}, &model.OperationLog{}, &model.SyncOutbox{}, &model.Alert{}, &model.AlertThresholdConfig{}); err != nil {
 			fatal(logger, "failed to auto migrate", "err", err)
 		}
 		if err := logDB.AutoMigrate(&model.LogRecord{}); err != nil {
@@ -153,7 +153,7 @@ func main() {
 			fatal(logger, "failed to ensure log indexes", "err", err)
 		}
 	} else {
-		if err := db.AutoMigrate(&model.Master{}, &model.Key{}, &model.ProviderGroup{}, &model.APIKey{}, &model.Model{}, &model.Binding{}, &model.Namespace{}, &model.OperationLog{}, &model.LogRecord{}, &model.SyncOutbox{}, &model.Alert{}); err != nil {
+		if err := db.AutoMigrate(&model.Master{}, &model.Key{}, &model.ProviderGroup{}, &model.APIKey{}, &model.Model{}, &model.Binding{}, &model.Namespace{}, &model.OperationLog{}, &model.LogRecord{}, &model.SyncOutbox{}, &model.Alert{}, &model.AlertThresholdConfig{}); err != nil {
 			fatal(logger, "failed to auto migrate", "err", err)
 		}
 		if err := service.EnsureLogIndexes(db); err != nil {
@@ -370,6 +370,8 @@ func main() {
 		adminGroup.GET("/alerts", alertHandler.ListAlerts)
 		adminGroup.POST("/alerts", alertHandler.CreateAlert)
 		adminGroup.GET("/alerts/stats", alertHandler.GetAlertStats)
+		adminGroup.GET("/alerts/thresholds", alertHandler.GetAlertThresholds)
+		adminGroup.PUT("/alerts/thresholds", alertHandler.UpdateAlertThresholds)
 		adminGroup.GET("/alerts/:id", alertHandler.GetAlert)
 		adminGroup.POST("/alerts/:id/ack", alertHandler.AcknowledgeAlert)
 		adminGroup.POST("/alerts/:id/resolve", alertHandler.ResolveAlert)
diff --git a/docs/api.md b/docs/api.md
index b92f6f3..3365904 100644
--- a/docs/api.md
+++ b/docs/api.md
@@ -398,6 +398,7 @@ curl "http://localhost:8080/admin/api-keys?group_id=1&status=suspended" \
 | `key_disabled` | Key has been disabled |
 | `key_expired` | Key has expired |
 | `provider_down` | Upstream provider unavailable |
+| `traffic_spike` | Traffic threshold exceeded |
 
 | Severity (`severity`) | Description |
 | :--- | :--- |
@@ -422,6 +423,8 @@ curl "http://localhost:8080/admin/api-keys?group_id=1&status=suspended" \
 | `/admin/alerts/:id/resolve` | POST | Resolve an alert |
 | `/admin/alerts/:id` | DELETE | Dismiss an alert (soft delete) |
 | `/admin/alerts/stats` | GET | Get alert statistics |
+| `/admin/alerts/thresholds` | GET | Get traffic threshold configuration |
+| `/admin/alerts/thresholds` | PUT | Update traffic threshold configuration |
 
 #### Create an alert
 ```bash
@@ -536,10 +539,16 @@ The CP side runs a background task that detects anomalies and generates alerts every minute.
 | Error spike | `error_spike` | info/warning/critical | Error rate >= 10% over the last 5 minutes (>= 50% is critical) |
 | Quota exceeded | `quota_exceeded` | warning/critical | Key quota usage >= 90% (100% is critical) |
 | Provider failure | `provider_down` | critical | API Key failure rate >= 50% with at least 10 failures |
+| Global QPS | `traffic_spike` | warning/critical | System-wide QPS >= threshold (>= 2x threshold is critical) |
+| Master RPM | `traffic_spike` | warning/critical | Per-master requests per minute >= threshold (minimum sample met) |
+| Master TPM | `traffic_spike` | warning/critical | Per-master tokens per minute >= threshold (minimum sample met) |
+| Master RPD | `traffic_spike` | warning/critical | Per-master requests per day >= threshold |
+| Master TPD | `traffic_spike` | warning/critical | Per-master tokens per day >= threshold |
 
 **Deduplication**:
 - Deduplicated by `fingerprint` (`type:related_type:related_id`)
 - An active alert with the same fingerprint is not re-created within 5 minutes
+- `traffic_spike` alerts use the `type:metric:related_type:related_id` format (see the example below)
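+
+For example, the detector produces fingerprints like the following (the master ID 42 is illustrative):
+
+```
+traffic_spike:global_qps:system:0
+traffic_spike:master_rpm:master:42
+```
+
+Repeated breaches of the same metric on the same subject within the cooldown therefore collapse into a single active alert.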
 
 **Default configuration values**:
 ```go
@@ -551,6 +560,67 @@ ProviderFailThreshold: 10          // upstream failure count threshold
 DeduplicationCooldown: 5 * time.Minute // deduplication cooldown
 ```
 
+### 6.7 Traffic threshold configuration
+
+Administrators can configure traffic alert thresholds via the API, with no service restart required.
+
+#### Get threshold configuration
+```bash
+curl http://localhost:8080/admin/alerts/thresholds \
+  -H "Authorization: Bearer <ADMIN_TOKEN>"
+```
+
+**Example response**:
+```json
+{
+  "global_qps": 100,
+  "master_rpm": 20,
+  "master_rpd": 1000,
+  "master_tpm": 10000000,
+  "master_tpd": 100000000,
+  "min_rpm_requests_1m": 10,
+  "min_tpm_tokens_1m": 1000000,
+  "updated_at": 1704153600
+}
+```
+
+#### Update threshold configuration
+```bash
+curl -X PUT http://localhost:8080/admin/alerts/thresholds \
+  -H "Authorization: Bearer <ADMIN_TOKEN>" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "global_qps": 200,
+    "master_rpm": 50
+  }'
+```
+
+**Threshold fields**:
+
+| Field | Default | Description |
+| :--- | :--- | :--- |
+| `global_qps` | 100 | System-wide QPS threshold |
+| `master_rpm` | 20 | Per-master requests-per-minute threshold |
+| `master_rpd` | 1000 | Per-master requests-per-day threshold |
+| `master_tpm` | 10,000,000 | Per-master tokens-per-minute threshold |
+| `master_tpd` | 100,000,000 | Per-master tokens-per-day threshold |
+| `min_rpm_requests_1m` | 10 | Minimum sample for the RPM check (below this, the RPM check is skipped) |
+| `min_tpm_tokens_1m` | 1,000,000 | Minimum sample for the TPM check (below this, the TPM check is skipped) |
+
+**Severity rules** (worked example below):
+- `warning`: value >= threshold
+- `critical`: value >= 2x threshold
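+
+For instance (request counts illustrative), with the default `master_rpm` of 20: 25 requests in the last minute raise a `warning` alert, while 40 or more raise `critical`. The sample floors matter only when a threshold sits below them: if `master_rpm` were lowered to 5, a burst of 8 requests would still not alert, because the `min_rpm_requests_1m` floor of 10 is not met.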
+
+**`traffic_spike` alert metadata**:
+```json
+{
+  "metric": "master_rpm",
+  "value": 150,
+  "threshold": 20,
+  "window": "1m"
+}
+```
+
 ---
 
 ## 7. Notes
diff --git a/internal/api/alert_handler.go b/internal/api/alert_handler.go
index 6fd13ba..28452a2 100644
--- a/internal/api/alert_handler.go
+++ b/internal/api/alert_handler.go
@@ -194,7 +194,7 @@ func (h *AlertHandler) CreateAlert(c *gin.Context) {
 	// Validate type
 	validTypes := map[string]bool{
 		"rate_limit": true, "error_spike": true, "quota_exceeded": true,
-		"key_disabled": true, "key_expired": true, "provider_down": true,
+		"key_disabled": true, "key_expired": true, "provider_down": true, "traffic_spike": true,
 	}
 	if !validTypes[req.Type] {
 		c.JSON(http.StatusBadRequest, gin.H{"error": "invalid alert type"})
@@ -403,3 +403,161 @@ func (h *AlertHandler) GetAlertStats(c *gin.Context) {
 		Info: info,
 	})
 }
+
+// AlertThresholdView represents threshold configuration in API responses
+type AlertThresholdView struct {
+	GlobalQPS        int64 `json:"global_qps"`
+	MasterRPM        int64 `json:"master_rpm"`
+	MasterRPD        int64 `json:"master_rpd"`
+	MasterTPM        int64 `json:"master_tpm"`
+	MasterTPD        int64 `json:"master_tpd"`
+	MinRPMRequests1m int64 `json:"min_rpm_requests_1m"`
+	MinTPMTokens1m   int64 `json:"min_tpm_tokens_1m"`
+	UpdatedAt        int64 `json:"updated_at"`
+}
+
+func toAlertThresholdView(cfg model.AlertThresholdConfig) AlertThresholdView {
+	return AlertThresholdView{
+		GlobalQPS:        cfg.GlobalQPS,
+		MasterRPM:        cfg.MasterRPM,
+		MasterRPD:        cfg.MasterRPD,
+		MasterTPM:        cfg.MasterTPM,
+		MasterTPD:        cfg.MasterTPD,
+		MinRPMRequests1m: cfg.MinRPMRequests1m,
+		MinTPMTokens1m:   cfg.MinTPMTokens1m,
+		UpdatedAt:        cfg.UpdatedAt.UTC().Unix(),
+	}
+}
+
+// GetAlertThresholds godoc
+// @Summary Get alert thresholds
+// @Description Get current alert threshold configuration for traffic spike detection
+// @Tags admin
+// @Produce json
+// @Security AdminAuth
+// @Success 200 {object} AlertThresholdView
+// @Failure 500 {object} gin.H
+// @Router /admin/alerts/thresholds [get]
+func (h *AlertHandler) GetAlertThresholds(c *gin.Context) {
+	cfg, err := h.loadThresholdConfig()
+	if err != nil {
+		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to load thresholds", "details": err.Error()})
+		return
+	}
+
+	c.JSON(http.StatusOK, toAlertThresholdView(cfg))
+}
+
+// UpdateAlertThresholdsRequest is the request body for updating thresholds
+type UpdateAlertThresholdsRequest struct {
+	GlobalQPS        *int64 `json:"global_qps"`
+	MasterRPM        *int64 `json:"master_rpm"`
+	MasterRPD        *int64 `json:"master_rpd"`
+	MasterTPM        *int64 `json:"master_tpm"`
+	MasterTPD        *int64 `json:"master_tpd"`
+	MinRPMRequests1m *int64 `json:"min_rpm_requests_1m"`
+	MinTPMTokens1m   *int64 `json:"min_tpm_tokens_1m"`
+}
+
+// UpdateAlertThresholds godoc
+// @Summary Update alert thresholds
+// @Description Update alert threshold configuration for traffic spike detection
+// @Tags admin
+// @Accept json
+// @Produce json
+// @Security AdminAuth
+// @Param request body UpdateAlertThresholdsRequest true "Threshold configuration"
+// @Success 200 {object} AlertThresholdView
+// @Failure 400 {object} gin.H
+// @Failure 500 {object} gin.H
+// @Router /admin/alerts/thresholds [put]
+func (h *AlertHandler) UpdateAlertThresholds(c *gin.Context) {
+	var req UpdateAlertThresholdsRequest
+	if err := c.ShouldBindJSON(&req); err != nil {
+		c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
+		return
+	}
+
+	// Validate values: thresholds must be positive, sample floors may be zero
+	if req.GlobalQPS != nil && *req.GlobalQPS <= 0 {
+		c.JSON(http.StatusBadRequest, gin.H{"error": "global_qps must be positive"})
+		return
+	}
+	if req.MasterRPM != nil && *req.MasterRPM <= 0 {
+		c.JSON(http.StatusBadRequest, gin.H{"error": "master_rpm must be positive"})
+		return
+	}
+	if req.MasterRPD != nil && *req.MasterRPD <= 0 {
+		c.JSON(http.StatusBadRequest, gin.H{"error": "master_rpd must be positive"})
+		return
+	}
+	if req.MasterTPM != nil && *req.MasterTPM <= 0 {
+		c.JSON(http.StatusBadRequest, gin.H{"error": "master_tpm must be positive"})
+		return
+	}
+	if req.MasterTPD != nil && *req.MasterTPD <= 0 {
+		c.JSON(http.StatusBadRequest, gin.H{"error": "master_tpd must be positive"})
+		return
+	}
+	if req.MinRPMRequests1m != nil && *req.MinRPMRequests1m < 0 {
+		c.JSON(http.StatusBadRequest, gin.H{"error": "min_rpm_requests_1m must be non-negative"})
+		return
+	}
+	if req.MinTPMTokens1m != nil && *req.MinTPMTokens1m < 0 {
+		c.JSON(http.StatusBadRequest, gin.H{"error": "min_tpm_tokens_1m must be non-negative"})
+		return
+	}
+
+	cfg, err := h.loadThresholdConfig()
+	if err != nil {
+		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to load thresholds", "details": err.Error()})
+		return
+	}
+
+	// Apply updates
+	if req.GlobalQPS != nil {
+		cfg.GlobalQPS = *req.GlobalQPS
+	}
+	if req.MasterRPM != nil {
+		cfg.MasterRPM = *req.MasterRPM
+	}
+	if req.MasterRPD != nil {
+		cfg.MasterRPD = *req.MasterRPD
+	}
+	if req.MasterTPM != nil {
+		cfg.MasterTPM = *req.MasterTPM
+	}
+	if req.MasterTPD != nil {
+		cfg.MasterTPD = *req.MasterTPD
+	}
+	if req.MinRPMRequests1m != nil {
+		cfg.MinRPMRequests1m = *req.MinRPMRequests1m
+	}
+	if req.MinTPMTokens1m != nil {
+		cfg.MinTPMTokens1m = *req.MinTPMTokens1m
+	}
+
+	if err := h.db.Save(&cfg).Error; err != nil {
+		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to save thresholds", "details": err.Error()})
+		return
+	}
+
+	c.JSON(http.StatusOK, toAlertThresholdView(cfg))
+}
+
+// loadThresholdConfig loads the threshold config from the DB, creating and
+// persisting the default config if none exists yet.
+func (h *AlertHandler) loadThresholdConfig() (model.AlertThresholdConfig, error) {
+	var cfg model.AlertThresholdConfig
+	err := h.db.First(&cfg).Error
+	if err != nil {
+		// errors.Is is the canonical GORM not-found check and survives error
+		// wrapping; matching the error string is brittle. (Assumes "errors"
+		// and "gorm.io/gorm" are imported in this file.)
+		if errors.Is(err, gorm.ErrRecordNotFound) {
+			// Create default config
+			cfg = model.DefaultAlertThresholdConfig()
+			if createErr := 
h.db.Create(&cfg).Error; createErr != nil { + return cfg, createErr + } + return cfg, nil + } + return cfg, err + } + return cfg, nil +} diff --git a/internal/cron/alert_detector.go b/internal/cron/alert_detector.go index 94ac9cf..7b4695e 100644 --- a/internal/cron/alert_detector.go +++ b/internal/cron/alert_detector.go @@ -2,6 +2,7 @@ package cron import ( "context" + "encoding/json" "fmt" "log/slog" "time" @@ -100,6 +101,7 @@ func (d *AlertDetector) detectOnce(ctx context.Context) { d.detectErrorSpikes(ctx) d.detectQuotaExceeded(ctx) d.detectProviderDown(ctx) + d.detectTrafficSpikes(ctx) } // detectRateLimits checks for masters hitting rate limits @@ -323,3 +325,284 @@ func (d *AlertDetector) errorRateSeverity(rate float64) model.AlertSeverity { } return model.AlertSeverityInfo } + +// loadThresholdConfig loads the threshold config from DB or returns defaults +func (d *AlertDetector) loadThresholdConfig() model.AlertThresholdConfig { + var cfg model.AlertThresholdConfig + err := d.db.First(&cfg).Error + if err != nil { + return model.DefaultAlertThresholdConfig() + } + return cfg +} + +// trafficSpikeSeverity determines severity based on value vs threshold +// warning when >= threshold, critical when >= 2x threshold +func trafficSpikeSeverity(value, threshold int64) model.AlertSeverity { + if value >= threshold*2 { + return model.AlertSeverityCritical + } + return model.AlertSeverityWarning +} + +// trafficSpikeMetadata creates JSON metadata for traffic spike alerts +type trafficSpikeMetadata struct { + Metric string `json:"metric"` + Value int64 `json:"value"` + Threshold int64 `json:"threshold"` + Window string `json:"window"` +} + +func (m trafficSpikeMetadata) JSON() string { + b, _ := json.Marshal(m) + return string(b) +} + +// detectTrafficSpikes checks for traffic threshold breaches +func (d *AlertDetector) detectTrafficSpikes(ctx context.Context) { + if d.rdb == nil || d.statsService == nil { + return + } + + cfg := d.loadThresholdConfig() + + // 1. Global QPS check + d.detectGlobalQPSSpike(ctx, cfg) + + // 2. Per-master RPM/TPM (1-minute window) + d.detectMasterMinuteSpikes(ctx, cfg) + + // 3. 
Per-master RPD/TPD (24-hour window) + d.detectMasterDaySpikes(ctx, cfg) +} + +// detectGlobalQPSSpike checks global QPS against threshold +func (d *AlertDetector) detectGlobalQPSSpike(ctx context.Context, cfg model.AlertThresholdConfig) { + // Sum QPS from all active masters + var masters []model.Master + if err := d.db.Where("status = ?", "active").Find(&masters).Error; err != nil { + d.logger.Warn("failed to load masters for global QPS check", "err", err) + return + } + + var totalQPS int64 + for _, master := range masters { + snapshot, err := d.statsService.GetMasterRealtimeSnapshot(ctx, master.ID) + if err != nil { + continue + } + totalQPS += snapshot.QPS + } + + if totalQPS >= cfg.GlobalQPS { + meta := trafficSpikeMetadata{ + Metric: "global_qps", + Value: totalQPS, + Threshold: cfg.GlobalQPS, + Window: "realtime", + } + d.createTrafficSpikeAlert( + trafficSpikeSeverity(totalQPS, cfg.GlobalQPS), + fmt.Sprintf("Global QPS threshold exceeded (%d >= %d)", totalQPS, cfg.GlobalQPS), + fmt.Sprintf("System-wide QPS is %d, threshold is %d", totalQPS, cfg.GlobalQPS), + 0, "system", "global", + meta, + ) + } +} + +// detectMasterMinuteSpikes checks per-master RPM/TPM in 1-minute window +func (d *AlertDetector) detectMasterMinuteSpikes(ctx context.Context, cfg model.AlertThresholdConfig) { + since := time.Now().UTC().Add(-1 * time.Minute) + + // Query aggregated stats per master for 1-minute window + type masterStat struct { + MasterID uint + Requests int64 + TokensIn int64 + TokensOut int64 + } + + var stats []masterStat + err := d.logDB.Model(&model.LogRecord{}). + Select("master_id, COUNT(*) as requests, COALESCE(SUM(tokens_in), 0) as tokens_in, COALESCE(SUM(tokens_out), 0) as tokens_out"). + Where("created_at >= ?", since). + Where("master_id > 0"). + Group("master_id"). 
+ Scan(&stats).Error + + if err != nil { + d.logger.Warn("failed to query master minute stats", "err", err) + return + } + + // Load master names for alerts + masterNames := make(map[uint]string) + var masters []model.Master + if err := d.db.Select("id, name").Find(&masters).Error; err == nil { + for _, m := range masters { + masterNames[m.ID] = m.Name + } + } + + for _, stat := range stats { + masterName := masterNames[stat.MasterID] + if masterName == "" { + masterName = fmt.Sprintf("Master#%d", stat.MasterID) + } + + // RPM check (with minimum sample threshold) + if stat.Requests >= cfg.MinRPMRequests1m && stat.Requests >= cfg.MasterRPM { + meta := trafficSpikeMetadata{ + Metric: "master_rpm", + Value: stat.Requests, + Threshold: cfg.MasterRPM, + Window: "1m", + } + d.createTrafficSpikeAlert( + trafficSpikeSeverity(stat.Requests, cfg.MasterRPM), + fmt.Sprintf("Master '%s' RPM threshold exceeded (%d >= %d)", masterName, stat.Requests, cfg.MasterRPM), + fmt.Sprintf("Master '%s' (ID: %d) has %d requests in the last minute, threshold is %d", masterName, stat.MasterID, stat.Requests, cfg.MasterRPM), + stat.MasterID, "master", masterName, + meta, + ) + } + + // TPM check (with minimum sample threshold) + totalTokens := stat.TokensIn + stat.TokensOut + if totalTokens >= cfg.MinTPMTokens1m && totalTokens >= cfg.MasterTPM { + meta := trafficSpikeMetadata{ + Metric: "master_tpm", + Value: totalTokens, + Threshold: cfg.MasterTPM, + Window: "1m", + } + d.createTrafficSpikeAlert( + trafficSpikeSeverity(totalTokens, cfg.MasterTPM), + fmt.Sprintf("Master '%s' TPM threshold exceeded (%d >= %d)", masterName, totalTokens, cfg.MasterTPM), + fmt.Sprintf("Master '%s' (ID: %d) used %d tokens in the last minute, threshold is %d", masterName, stat.MasterID, totalTokens, cfg.MasterTPM), + stat.MasterID, "master", masterName, + meta, + ) + } + } +} + +// detectMasterDaySpikes checks per-master RPD/TPD in 24-hour window +func (d *AlertDetector) detectMasterDaySpikes(ctx context.Context, cfg model.AlertThresholdConfig) { + since := time.Now().UTC().Add(-24 * time.Hour) + + // Query aggregated stats per master for 24-hour window + type masterStat struct { + MasterID uint + Requests int64 + TokensIn int64 + TokensOut int64 + } + + var stats []masterStat + err := d.logDB.Model(&model.LogRecord{}). + Select("master_id, COUNT(*) as requests, COALESCE(SUM(tokens_in), 0) as tokens_in, COALESCE(SUM(tokens_out), 0) as tokens_out"). + Where("created_at >= ?", since). + Where("master_id > 0"). + Group("master_id"). 
+ Scan(&stats).Error + + if err != nil { + d.logger.Warn("failed to query master day stats", "err", err) + return + } + + // Load master names for alerts + masterNames := make(map[uint]string) + var masters []model.Master + if err := d.db.Select("id, name").Find(&masters).Error; err == nil { + for _, m := range masters { + masterNames[m.ID] = m.Name + } + } + + for _, stat := range stats { + masterName := masterNames[stat.MasterID] + if masterName == "" { + masterName = fmt.Sprintf("Master#%d", stat.MasterID) + } + + // RPD check + if stat.Requests >= cfg.MasterRPD { + meta := trafficSpikeMetadata{ + Metric: "master_rpd", + Value: stat.Requests, + Threshold: cfg.MasterRPD, + Window: "24h", + } + d.createTrafficSpikeAlert( + trafficSpikeSeverity(stat.Requests, cfg.MasterRPD), + fmt.Sprintf("Master '%s' RPD threshold exceeded (%d >= %d)", masterName, stat.Requests, cfg.MasterRPD), + fmt.Sprintf("Master '%s' (ID: %d) has %d requests in the last 24 hours, threshold is %d", masterName, stat.MasterID, stat.Requests, cfg.MasterRPD), + stat.MasterID, "master", masterName, + meta, + ) + } + + // TPD check + totalTokens := stat.TokensIn + stat.TokensOut + if totalTokens >= cfg.MasterTPD { + meta := trafficSpikeMetadata{ + Metric: "master_tpd", + Value: totalTokens, + Threshold: cfg.MasterTPD, + Window: "24h", + } + d.createTrafficSpikeAlert( + trafficSpikeSeverity(totalTokens, cfg.MasterTPD), + fmt.Sprintf("Master '%s' TPD threshold exceeded (%d >= %d)", masterName, totalTokens, cfg.MasterTPD), + fmt.Sprintf("Master '%s' (ID: %d) used %d tokens in the last 24 hours, threshold is %d", masterName, stat.MasterID, totalTokens, cfg.MasterTPD), + stat.MasterID, "master", masterName, + meta, + ) + } + } +} + +// createTrafficSpikeAlert creates a traffic_spike alert with metadata +func (d *AlertDetector) createTrafficSpikeAlert( + severity model.AlertSeverity, + title, message string, + relatedID uint, + relatedType, relatedName string, + meta trafficSpikeMetadata, +) { + fingerprint := fmt.Sprintf("%s:%s:%s:%d", model.AlertTypeTrafficSpike, meta.Metric, relatedType, relatedID) + cooldownTime := time.Now().UTC().Add(-d.config.DeduplicationCooldown) + + // Check for existing active alert with same fingerprint + var count int64 + d.db.Model(&model.Alert{}). + Where("fingerprint = ? AND status = ? AND created_at >= ?", fingerprint, model.AlertStatusActive, cooldownTime). 
+ Count(&count) + + if count > 0 { + return // Duplicate within cooldown + } + + alert := model.Alert{ + Type: model.AlertTypeTrafficSpike, + Severity: severity, + Status: model.AlertStatusActive, + Title: title, + Message: message, + RelatedID: relatedID, + RelatedType: relatedType, + RelatedName: relatedName, + Fingerprint: fingerprint, + Metadata: meta.JSON(), + } + + if err := d.db.Create(&alert).Error; err != nil { + d.logger.Warn("failed to create traffic spike alert", "metric", meta.Metric, "err", err) + return + } + + d.logger.Info("traffic spike alert created", "metric", meta.Metric, "severity", severity, "title", title) +} diff --git a/internal/model/alert.go b/internal/model/alert.go index ddd4849..0e07cc9 100644 --- a/internal/model/alert.go +++ b/internal/model/alert.go @@ -16,6 +16,7 @@ const ( AlertTypeKeyDisabled AlertType = "key_disabled" AlertTypeKeyExpired AlertType = "key_expired" AlertTypeProviderDown AlertType = "provider_down" + AlertTypeTrafficSpike AlertType = "traffic_spike" ) // AlertSeverity defines the severity level of an alert diff --git a/internal/model/alert_threshold.go b/internal/model/alert_threshold.go new file mode 100644 index 0000000..17dbf92 --- /dev/null +++ b/internal/model/alert_threshold.go @@ -0,0 +1,46 @@ +package model + +import ( + "time" + + "gorm.io/gorm" +) + +// AlertThresholdConfig stores configurable thresholds for traffic spike detection +type AlertThresholdConfig struct { + ID uint `gorm:"primaryKey" json:"id"` + CreatedAt time.Time `json:"created_at"` + UpdatedAt time.Time `json:"updated_at"` + DeletedAt gorm.DeletedAt `gorm:"index" json:"-"` + + // Global thresholds + GlobalQPS int64 `gorm:"not null;default:100" json:"global_qps"` // System-wide QPS threshold + + // Per-master thresholds + MasterRPM int64 `gorm:"not null;default:20" json:"master_rpm"` // Requests per minute threshold + MasterRPD int64 `gorm:"not null;default:1000" json:"master_rpd"` // Requests per day threshold + MasterTPM int64 `gorm:"not null;default:10000000" json:"master_tpm"` // Tokens per minute threshold + MasterTPD int64 `gorm:"not null;default:100000000" json:"master_tpd"` // Tokens per day threshold + + // Minimum sample thresholds for 1-minute window checks + MinRPMRequests1m int64 `gorm:"not null;default:10" json:"min_rpm_requests_1m"` // Min requests in 1m to trigger RPM check + MinTPMTokens1m int64 `gorm:"not null;default:1000000" json:"min_tpm_tokens_1m"` // Min tokens in 1m to trigger TPM check +} + +// DefaultAlertThresholdConfig returns the default threshold configuration +func DefaultAlertThresholdConfig() AlertThresholdConfig { + return AlertThresholdConfig{ + GlobalQPS: 100, + MasterRPM: 20, + MasterRPD: 1000, + MasterTPM: 10_000_000, + MasterTPD: 100_000_000, + MinRPMRequests1m: 10, + MinTPMTokens1m: 1_000_000, + } +} + +// TableName returns the table name for AlertThresholdConfig +func (AlertThresholdConfig) TableName() string { + return "alert_threshold_configs" +}
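
A minimal test sketch for the `trafficSpikeSeverity` cutoffs introduced in `internal/cron/alert_detector.go`. This is illustrative only: the test file would live in the same `cron` package, and the `model` import path is a placeholder to adapt to this repo's module path.

```go
package cron

import (
	"testing"

	"example.com/project/internal/model" // placeholder; use this repo's module path
)

// Severity follows the documented rule: warning at >= threshold,
// critical at >= 2x threshold.
func TestTrafficSpikeSeverity(t *testing.T) {
	cases := []struct {
		value, threshold int64
		want             model.AlertSeverity
	}{
		{20, 20, model.AlertSeverityWarning},  // exactly at threshold
		{39, 20, model.AlertSeverityWarning},  // above threshold, below 2x
		{40, 20, model.AlertSeverityCritical}, // at 2x threshold
	}
	for _, c := range cases {
		if got := trafficSpikeSeverity(c.value, c.threshold); got != c.want {
			t.Errorf("trafficSpikeSeverity(%d, %d) = %v, want %v", c.value, c.threshold, got, c.want)
		}
	}
}
```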