Mirror of https://github.com/EZ-Api/ez-api.git (synced 2026-01-13 17:47:51 +00:00)
feat(alerts): add traffic spike detection with configurable thresholds
Introduce a traffic_spike alert type for monitoring system-wide and per-master traffic levels against configurable thresholds stored in the database.

- Add AlertThresholdConfig model for persistent threshold configuration
- Implement GET/PUT /admin/alerts/thresholds endpoints for threshold management
- Add traffic spike detection to the alert detector cron job:
  - Global QPS monitoring across all masters
  - Per-master RPM/TPM checks with minimum sample thresholds
  - Per-master RPD/TPD checks for daily limits
- Use warning severity at the threshold, critical at 2x the threshold
- Include metric metadata (value, threshold, window) in alert details
- Update API documentation with the new endpoints and alert type
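The detection rule described in the message (skip below the minimum sample, warn at the threshold, escalate to critical at twice the threshold) can be illustrated with a small standalone sketch; `classify` is a hypothetical helper and the sample readings are made up, while the 10/20 defaults mirror the per-master RPM values introduced below.

```go
package main

import "fmt"

// classify applies the rule from the commit message: below the minimum
// sample -> no alert, >= threshold -> warning, >= 2x threshold -> critical.
func classify(value, minSample, threshold int64) string {
	switch {
	case value < minSample || value < threshold:
		return "none"
	case value >= threshold*2:
		return "critical"
	default:
		return "warning"
	}
}

func main() {
	// Per-master RPM with the defaults this commit introduces:
	// minimum sample 10 requests/min, threshold 20 requests/min.
	for _, rpm := range []int64{8, 25, 45} {
		fmt.Printf("rpm=%d -> %s\n", rpm, classify(rpm, 10, 20))
	}
	// Prints: rpm=8 -> none, rpm=25 -> warning, rpm=45 -> critical
}
```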
@@ -143,7 +143,7 @@ func main() {
 
     // Auto Migrate
     if logDB != db {
-        if err := db.AutoMigrate(&model.Master{}, &model.Key{}, &model.ProviderGroup{}, &model.APIKey{}, &model.Model{}, &model.Binding{}, &model.Namespace{}, &model.OperationLog{}, &model.SyncOutbox{}, &model.Alert{}); err != nil {
+        if err := db.AutoMigrate(&model.Master{}, &model.Key{}, &model.ProviderGroup{}, &model.APIKey{}, &model.Model{}, &model.Binding{}, &model.Namespace{}, &model.OperationLog{}, &model.SyncOutbox{}, &model.Alert{}, &model.AlertThresholdConfig{}); err != nil {
            fatal(logger, "failed to auto migrate", "err", err)
        }
        if err := logDB.AutoMigrate(&model.LogRecord{}); err != nil {
@@ -153,7 +153,7 @@ func main() {
            fatal(logger, "failed to ensure log indexes", "err", err)
        }
    } else {
-        if err := db.AutoMigrate(&model.Master{}, &model.Key{}, &model.ProviderGroup{}, &model.APIKey{}, &model.Model{}, &model.Binding{}, &model.Namespace{}, &model.OperationLog{}, &model.LogRecord{}, &model.SyncOutbox{}, &model.Alert{}); err != nil {
+        if err := db.AutoMigrate(&model.Master{}, &model.Key{}, &model.ProviderGroup{}, &model.APIKey{}, &model.Model{}, &model.Binding{}, &model.Namespace{}, &model.OperationLog{}, &model.LogRecord{}, &model.SyncOutbox{}, &model.Alert{}, &model.AlertThresholdConfig{}); err != nil {
            fatal(logger, "failed to auto migrate", "err", err)
        }
        if err := service.EnsureLogIndexes(db); err != nil {
@@ -370,6 +370,8 @@ func main() {
    adminGroup.GET("/alerts", alertHandler.ListAlerts)
    adminGroup.POST("/alerts", alertHandler.CreateAlert)
    adminGroup.GET("/alerts/stats", alertHandler.GetAlertStats)
+   adminGroup.GET("/alerts/thresholds", alertHandler.GetAlertThresholds)
+   adminGroup.PUT("/alerts/thresholds", alertHandler.UpdateAlertThresholds)
    adminGroup.GET("/alerts/:id", alertHandler.GetAlert)
    adminGroup.POST("/alerts/:id/ack", alertHandler.AcknowledgeAlert)
    adminGroup.POST("/alerts/:id/resolve", alertHandler.ResolveAlert)
docs/api.md (70 changed lines)
@@ -398,6 +398,7 @@ curl "http://localhost:8080/admin/api-keys?group_id=1&status=suspended" \
 | `key_disabled` | Key disabled |
 | `key_expired` | Key expired |
 | `provider_down` | Upstream service unavailable |
+| `traffic_spike` | Traffic threshold exceeded |
 
 | Severity (`severity`) | Description |
 | :--- | :--- |
@@ -422,6 +423,8 @@ curl "http://localhost:8080/admin/api-keys?group_id=1&status=suspended" \
 | `/admin/alerts/:id/resolve` | POST | Resolve an alert |
 | `/admin/alerts/:id` | DELETE | Dismiss an alert (soft delete) |
 | `/admin/alerts/stats` | GET | Get alert statistics |
+| `/admin/alerts/thresholds` | GET | Get traffic threshold configuration |
+| `/admin/alerts/thresholds` | PUT | Update traffic threshold configuration |
 
 #### Create an alert
 ```bash
@@ -536,10 +539,16 @@ The CP side runs a background task that automatically checks for anomalies every minute and generates alerts.
 | Error spike | `error_spike` | info/warning/critical | Error rate >= 10% over the last 5 minutes (>= 50% is critical) |
 | Quota exceeded | `quota_exceeded` | warning/critical | Key quota usage >= 90% (100% is critical) |
 | Provider down | `provider_down` | critical | API Key failure rate >= 50% with at least 10 failures |
+| Global QPS | `traffic_spike` | warning/critical | System-wide QPS >= threshold (>= 2x threshold is critical) |
+| Master RPM | `traffic_spike` | warning/critical | Per-master requests per minute >= threshold (minimum sample met) |
+| Master TPM | `traffic_spike` | warning/critical | Per-master tokens per minute >= threshold (minimum sample met) |
+| Master RPD | `traffic_spike` | warning/critical | Per-master requests per day >= threshold |
+| Master TPD | `traffic_spike` | warning/critical | Per-master tokens per day >= threshold |
 
 **Deduplication**:
 - Deduplicated by `fingerprint` (`type:related_type:related_id`)
 - Active alerts with the same fingerprint are not created again within 5 minutes
+- traffic_spike alerts use the `type:metric:related_type:related_id` format
 
 **Default configuration**:
 ```go
@@ -551,6 +560,67 @@ ProviderFailThreshold: 10 // upstream failure count threshold
 DeduplicationCooldown: 5 * time.Minute // deduplication cooldown
 ```
+
+### 6.7 Traffic threshold configuration
+
+Administrators can adjust the traffic alert thresholds through the API without restarting the service.
+
+#### Get threshold configuration
+```bash
+curl http://localhost:8080/admin/alerts/thresholds \
+  -H "Authorization: Bearer <admin_token>"
+```
+
+**Example response**:
+```json
+{
+  "global_qps": 100,
+  "master_rpm": 20,
+  "master_rpd": 1000,
+  "master_tpm": 10000000,
+  "master_tpd": 100000000,
+  "min_rpm_requests_1m": 10,
+  "min_tpm_tokens_1m": 1000000,
+  "updated_at": 1704153600
+}
+```
+
+#### Update threshold configuration
+```bash
+curl -X PUT http://localhost:8080/admin/alerts/thresholds \
+  -H "Authorization: Bearer <admin_token>" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "global_qps": 200,
+    "master_rpm": 50
+  }'
+```
+
+**Threshold fields**:
+
+| Field | Default | Description |
+| :--- | :--- | :--- |
+| `global_qps` | 100 | System-wide QPS threshold |
+| `master_rpm` | 20 | Per-master requests-per-minute threshold |
+| `master_rpd` | 1000 | Per-master requests-per-day threshold |
+| `master_tpm` | 10,000,000 | Per-master tokens-per-minute threshold |
+| `master_tpd` | 100,000,000 | Per-master tokens-per-day threshold |
+| `min_rpm_requests_1m` | 10 | Minimum sample for the RPM check (the check is skipped below this value) |
+| `min_tpm_tokens_1m` | 1,000,000 | Minimum sample for the TPM check (the check is skipped below this value) |
+
+**Severity rules**:
+- `warning`: value >= threshold
+- `critical`: value >= 2x threshold
+
+**traffic_spike alert metadata**:
+```json
+{
+  "metric": "master_rpm",
+  "value": 150,
+  "threshold": 20,
+  "window": "1m"
+}
+```
+
 ---
 
 ## 7. Notes
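To make the deduplication format concrete: because the metric name is part of a traffic_spike fingerprint, an RPM breach and a TPM breach on the same master produce different fingerprints and do not suppress each other. A minimal sketch, using an illustrative master ID of 42:

```go
package main

import "fmt"

func main() {
	// traffic_spike fingerprints follow type:metric:related_type:related_id.
	global := fmt.Sprintf("%s:%s:%s:%d", "traffic_spike", "global_qps", "system", 0)
	rpm := fmt.Sprintf("%s:%s:%s:%d", "traffic_spike", "master_rpm", "master", 42)
	tpm := fmt.Sprintf("%s:%s:%s:%d", "traffic_spike", "master_tpm", "master", 42)

	// Three distinct fingerprints; only an identical string within the
	// 5-minute cooldown suppresses a new alert.
	fmt.Println(global) // traffic_spike:global_qps:system:0
	fmt.Println(rpm)    // traffic_spike:master_rpm:master:42
	fmt.Println(tpm)    // traffic_spike:master_tpm:master:42
}
```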
@@ -194,7 +194,7 @@ func (h *AlertHandler) CreateAlert(c *gin.Context) {
    // Validate type
    validTypes := map[string]bool{
        "rate_limit": true, "error_spike": true, "quota_exceeded": true,
-       "key_disabled": true, "key_expired": true, "provider_down": true,
+       "key_disabled": true, "key_expired": true, "provider_down": true, "traffic_spike": true,
    }
    if !validTypes[req.Type] {
        c.JSON(http.StatusBadRequest, gin.H{"error": "invalid alert type"})
@@ -403,3 +403,161 @@ func (h *AlertHandler) GetAlertStats(c *gin.Context) {
        Info: info,
    })
 }
+
+// AlertThresholdView represents threshold configuration in API responses
+type AlertThresholdView struct {
+   GlobalQPS        int64 `json:"global_qps"`
+   MasterRPM        int64 `json:"master_rpm"`
+   MasterRPD        int64 `json:"master_rpd"`
+   MasterTPM        int64 `json:"master_tpm"`
+   MasterTPD        int64 `json:"master_tpd"`
+   MinRPMRequests1m int64 `json:"min_rpm_requests_1m"`
+   MinTPMTokens1m   int64 `json:"min_tpm_tokens_1m"`
+   UpdatedAt        int64 `json:"updated_at"`
+}
+
+func toAlertThresholdView(cfg model.AlertThresholdConfig) AlertThresholdView {
+   return AlertThresholdView{
+       GlobalQPS:        cfg.GlobalQPS,
+       MasterRPM:        cfg.MasterRPM,
+       MasterRPD:        cfg.MasterRPD,
+       MasterTPM:        cfg.MasterTPM,
+       MasterTPD:        cfg.MasterTPD,
+       MinRPMRequests1m: cfg.MinRPMRequests1m,
+       MinTPMTokens1m:   cfg.MinTPMTokens1m,
+       UpdatedAt:        cfg.UpdatedAt.UTC().Unix(),
+   }
+}
+
+// GetAlertThresholds godoc
+// @Summary Get alert thresholds
+// @Description Get current alert threshold configuration for traffic spike detection
+// @Tags admin
+// @Produce json
+// @Security AdminAuth
+// @Success 200 {object} AlertThresholdView
+// @Failure 500 {object} gin.H
+// @Router /admin/alerts/thresholds [get]
+func (h *AlertHandler) GetAlertThresholds(c *gin.Context) {
+   cfg, err := h.loadThresholdConfig()
+   if err != nil {
+       c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to load thresholds", "details": err.Error()})
+       return
+   }
+   c.JSON(http.StatusOK, toAlertThresholdView(cfg))
+}
+
+// UpdateAlertThresholdsRequest is the request body for updating thresholds
+type UpdateAlertThresholdsRequest struct {
+   GlobalQPS        *int64 `json:"global_qps"`
+   MasterRPM        *int64 `json:"master_rpm"`
+   MasterRPD        *int64 `json:"master_rpd"`
+   MasterTPM        *int64 `json:"master_tpm"`
+   MasterTPD        *int64 `json:"master_tpd"`
+   MinRPMRequests1m *int64 `json:"min_rpm_requests_1m"`
+   MinTPMTokens1m   *int64 `json:"min_tpm_tokens_1m"`
+}
+
+// UpdateAlertThresholds godoc
+// @Summary Update alert thresholds
+// @Description Update alert threshold configuration for traffic spike detection
+// @Tags admin
+// @Accept json
+// @Produce json
+// @Security AdminAuth
+// @Param request body UpdateAlertThresholdsRequest true "Threshold configuration"
+// @Success 200 {object} AlertThresholdView
+// @Failure 400 {object} gin.H
+// @Failure 500 {object} gin.H
+// @Router /admin/alerts/thresholds [put]
+func (h *AlertHandler) UpdateAlertThresholds(c *gin.Context) {
+   var req UpdateAlertThresholdsRequest
+   if err := c.ShouldBindJSON(&req); err != nil {
+       c.JSON(http.StatusBadRequest, gin.H{"error": err.Error()})
+       return
+   }
+
+   // Validate positive values
+   if req.GlobalQPS != nil && *req.GlobalQPS <= 0 {
+       c.JSON(http.StatusBadRequest, gin.H{"error": "global_qps must be positive"})
+       return
+   }
+   if req.MasterRPM != nil && *req.MasterRPM <= 0 {
+       c.JSON(http.StatusBadRequest, gin.H{"error": "master_rpm must be positive"})
+       return
+   }
+   if req.MasterRPD != nil && *req.MasterRPD <= 0 {
+       c.JSON(http.StatusBadRequest, gin.H{"error": "master_rpd must be positive"})
+       return
+   }
+   if req.MasterTPM != nil && *req.MasterTPM <= 0 {
+       c.JSON(http.StatusBadRequest, gin.H{"error": "master_tpm must be positive"})
+       return
+   }
+   if req.MasterTPD != nil && *req.MasterTPD <= 0 {
+       c.JSON(http.StatusBadRequest, gin.H{"error": "master_tpd must be positive"})
+       return
+   }
+   if req.MinRPMRequests1m != nil && *req.MinRPMRequests1m < 0 {
+       c.JSON(http.StatusBadRequest, gin.H{"error": "min_rpm_requests_1m must be non-negative"})
+       return
+   }
+   if req.MinTPMTokens1m != nil && *req.MinTPMTokens1m < 0 {
+       c.JSON(http.StatusBadRequest, gin.H{"error": "min_tpm_tokens_1m must be non-negative"})
+       return
+   }
+
+   cfg, err := h.loadThresholdConfig()
+   if err != nil {
+       c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to load thresholds", "details": err.Error()})
+       return
+   }
+
+   // Apply updates
+   if req.GlobalQPS != nil {
+       cfg.GlobalQPS = *req.GlobalQPS
+   }
+   if req.MasterRPM != nil {
+       cfg.MasterRPM = *req.MasterRPM
+   }
+   if req.MasterRPD != nil {
+       cfg.MasterRPD = *req.MasterRPD
+   }
+   if req.MasterTPM != nil {
+       cfg.MasterTPM = *req.MasterTPM
+   }
+   if req.MasterTPD != nil {
+       cfg.MasterTPD = *req.MasterTPD
+   }
+   if req.MinRPMRequests1m != nil {
+       cfg.MinRPMRequests1m = *req.MinRPMRequests1m
+   }
+   if req.MinTPMTokens1m != nil {
+       cfg.MinTPMTokens1m = *req.MinTPMTokens1m
+   }
+
+   if err := h.db.Save(&cfg).Error; err != nil {
+       c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to save thresholds", "details": err.Error()})
+       return
+   }
+
+   c.JSON(http.StatusOK, toAlertThresholdView(cfg))
+}
+
+// loadThresholdConfig loads the threshold config from DB or returns defaults
+func (h *AlertHandler) loadThresholdConfig() (model.AlertThresholdConfig, error) {
+   var cfg model.AlertThresholdConfig
+   err := h.db.First(&cfg).Error
+   if err != nil {
+       if err.Error() == "record not found" {
+           // Create default config
+           cfg = model.DefaultAlertThresholdConfig()
+           if createErr := h.db.Create(&cfg).Error; createErr != nil {
+               return cfg, createErr
+           }
+           return cfg, nil
+       }
+       return cfg, err
+   }
+   return cfg, nil
+}
@@ -2,6 +2,7 @@ package cron
 
 import (
    "context"
+   "encoding/json"
    "fmt"
    "log/slog"
    "time"
@@ -100,6 +101,7 @@ func (d *AlertDetector) detectOnce(ctx context.Context) {
    d.detectErrorSpikes(ctx)
    d.detectQuotaExceeded(ctx)
    d.detectProviderDown(ctx)
+   d.detectTrafficSpikes(ctx)
 }
 
 // detectRateLimits checks for masters hitting rate limits
@@ -323,3 +325,284 @@ func (d *AlertDetector) errorRateSeverity(rate float64) model.AlertSeverity {
    }
    return model.AlertSeverityInfo
 }
+
+// loadThresholdConfig loads the threshold config from DB or returns defaults
+func (d *AlertDetector) loadThresholdConfig() model.AlertThresholdConfig {
+   var cfg model.AlertThresholdConfig
+   err := d.db.First(&cfg).Error
+   if err != nil {
+       return model.DefaultAlertThresholdConfig()
+   }
+   return cfg
+}
+
+// trafficSpikeSeverity determines severity based on value vs threshold:
+// warning when >= threshold, critical when >= 2x threshold
+func trafficSpikeSeverity(value, threshold int64) model.AlertSeverity {
+   if value >= threshold*2 {
+       return model.AlertSeverityCritical
+   }
+   return model.AlertSeverityWarning
+}
+
+// trafficSpikeMetadata creates JSON metadata for traffic spike alerts
+type trafficSpikeMetadata struct {
+   Metric    string `json:"metric"`
+   Value     int64  `json:"value"`
+   Threshold int64  `json:"threshold"`
+   Window    string `json:"window"`
+}
+
+func (m trafficSpikeMetadata) JSON() string {
+   b, _ := json.Marshal(m)
+   return string(b)
+}
+
+// detectTrafficSpikes checks for traffic threshold breaches
+func (d *AlertDetector) detectTrafficSpikes(ctx context.Context) {
+   if d.rdb == nil || d.statsService == nil {
+       return
+   }
+
+   cfg := d.loadThresholdConfig()
+
+   // 1. Global QPS check
+   d.detectGlobalQPSSpike(ctx, cfg)
+
+   // 2. Per-master RPM/TPM (1-minute window)
+   d.detectMasterMinuteSpikes(ctx, cfg)
+
+   // 3. Per-master RPD/TPD (24-hour window)
+   d.detectMasterDaySpikes(ctx, cfg)
+}
+
+// detectGlobalQPSSpike checks global QPS against threshold
+func (d *AlertDetector) detectGlobalQPSSpike(ctx context.Context, cfg model.AlertThresholdConfig) {
+   // Sum QPS from all active masters
+   var masters []model.Master
+   if err := d.db.Where("status = ?", "active").Find(&masters).Error; err != nil {
+       d.logger.Warn("failed to load masters for global QPS check", "err", err)
+       return
+   }
+
+   var totalQPS int64
+   for _, master := range masters {
+       snapshot, err := d.statsService.GetMasterRealtimeSnapshot(ctx, master.ID)
+       if err != nil {
+           continue
+       }
+       totalQPS += snapshot.QPS
+   }
+
+   if totalQPS >= cfg.GlobalQPS {
+       meta := trafficSpikeMetadata{
+           Metric:    "global_qps",
+           Value:     totalQPS,
+           Threshold: cfg.GlobalQPS,
+           Window:    "realtime",
+       }
+       d.createTrafficSpikeAlert(
+           trafficSpikeSeverity(totalQPS, cfg.GlobalQPS),
+           fmt.Sprintf("Global QPS threshold exceeded (%d >= %d)", totalQPS, cfg.GlobalQPS),
+           fmt.Sprintf("System-wide QPS is %d, threshold is %d", totalQPS, cfg.GlobalQPS),
+           0, "system", "global",
+           meta,
+       )
+   }
+}
+
+// detectMasterMinuteSpikes checks per-master RPM/TPM in 1-minute window
+func (d *AlertDetector) detectMasterMinuteSpikes(ctx context.Context, cfg model.AlertThresholdConfig) {
+   since := time.Now().UTC().Add(-1 * time.Minute)
+
+   // Query aggregated stats per master for 1-minute window
+   type masterStat struct {
+       MasterID  uint
+       Requests  int64
+       TokensIn  int64
+       TokensOut int64
+   }
+
+   var stats []masterStat
+   err := d.logDB.Model(&model.LogRecord{}).
+       Select("master_id, COUNT(*) as requests, COALESCE(SUM(tokens_in), 0) as tokens_in, COALESCE(SUM(tokens_out), 0) as tokens_out").
+       Where("created_at >= ?", since).
+       Where("master_id > 0").
+       Group("master_id").
+       Scan(&stats).Error
+   if err != nil {
+       d.logger.Warn("failed to query master minute stats", "err", err)
+       return
+   }
+
+   // Load master names for alerts
+   masterNames := make(map[uint]string)
+   var masters []model.Master
+   if err := d.db.Select("id, name").Find(&masters).Error; err == nil {
+       for _, m := range masters {
+           masterNames[m.ID] = m.Name
+       }
+   }
+
+   for _, stat := range stats {
+       masterName := masterNames[stat.MasterID]
+       if masterName == "" {
+           masterName = fmt.Sprintf("Master#%d", stat.MasterID)
+       }
+
+       // RPM check (with minimum sample threshold)
+       if stat.Requests >= cfg.MinRPMRequests1m && stat.Requests >= cfg.MasterRPM {
+           meta := trafficSpikeMetadata{
+               Metric:    "master_rpm",
+               Value:     stat.Requests,
+               Threshold: cfg.MasterRPM,
+               Window:    "1m",
+           }
+           d.createTrafficSpikeAlert(
+               trafficSpikeSeverity(stat.Requests, cfg.MasterRPM),
+               fmt.Sprintf("Master '%s' RPM threshold exceeded (%d >= %d)", masterName, stat.Requests, cfg.MasterRPM),
+               fmt.Sprintf("Master '%s' (ID: %d) has %d requests in the last minute, threshold is %d", masterName, stat.MasterID, stat.Requests, cfg.MasterRPM),
+               stat.MasterID, "master", masterName,
+               meta,
+           )
+       }
+
+       // TPM check (with minimum sample threshold)
+       totalTokens := stat.TokensIn + stat.TokensOut
+       if totalTokens >= cfg.MinTPMTokens1m && totalTokens >= cfg.MasterTPM {
+           meta := trafficSpikeMetadata{
+               Metric:    "master_tpm",
+               Value:     totalTokens,
+               Threshold: cfg.MasterTPM,
+               Window:    "1m",
+           }
+           d.createTrafficSpikeAlert(
+               trafficSpikeSeverity(totalTokens, cfg.MasterTPM),
+               fmt.Sprintf("Master '%s' TPM threshold exceeded (%d >= %d)", masterName, totalTokens, cfg.MasterTPM),
+               fmt.Sprintf("Master '%s' (ID: %d) used %d tokens in the last minute, threshold is %d", masterName, stat.MasterID, totalTokens, cfg.MasterTPM),
+               stat.MasterID, "master", masterName,
+               meta,
+           )
+       }
+   }
+}
+
+// detectMasterDaySpikes checks per-master RPD/TPD in 24-hour window
+func (d *AlertDetector) detectMasterDaySpikes(ctx context.Context, cfg model.AlertThresholdConfig) {
+   since := time.Now().UTC().Add(-24 * time.Hour)
+
+   // Query aggregated stats per master for 24-hour window
+   type masterStat struct {
+       MasterID  uint
+       Requests  int64
+       TokensIn  int64
+       TokensOut int64
+   }
+
+   var stats []masterStat
+   err := d.logDB.Model(&model.LogRecord{}).
+       Select("master_id, COUNT(*) as requests, COALESCE(SUM(tokens_in), 0) as tokens_in, COALESCE(SUM(tokens_out), 0) as tokens_out").
+       Where("created_at >= ?", since).
+       Where("master_id > 0").
+       Group("master_id").
+       Scan(&stats).Error
+   if err != nil {
+       d.logger.Warn("failed to query master day stats", "err", err)
+       return
+   }
+
+   // Load master names for alerts
+   masterNames := make(map[uint]string)
+   var masters []model.Master
+   if err := d.db.Select("id, name").Find(&masters).Error; err == nil {
+       for _, m := range masters {
+           masterNames[m.ID] = m.Name
+       }
+   }
+
+   for _, stat := range stats {
+       masterName := masterNames[stat.MasterID]
+       if masterName == "" {
+           masterName = fmt.Sprintf("Master#%d", stat.MasterID)
+       }
+
+       // RPD check
+       if stat.Requests >= cfg.MasterRPD {
+           meta := trafficSpikeMetadata{
+               Metric:    "master_rpd",
+               Value:     stat.Requests,
+               Threshold: cfg.MasterRPD,
+               Window:    "24h",
+           }
+           d.createTrafficSpikeAlert(
+               trafficSpikeSeverity(stat.Requests, cfg.MasterRPD),
+               fmt.Sprintf("Master '%s' RPD threshold exceeded (%d >= %d)", masterName, stat.Requests, cfg.MasterRPD),
+               fmt.Sprintf("Master '%s' (ID: %d) has %d requests in the last 24 hours, threshold is %d", masterName, stat.MasterID, stat.Requests, cfg.MasterRPD),
+               stat.MasterID, "master", masterName,
+               meta,
+           )
+       }
+
+       // TPD check
+       totalTokens := stat.TokensIn + stat.TokensOut
+       if totalTokens >= cfg.MasterTPD {
+           meta := trafficSpikeMetadata{
+               Metric:    "master_tpd",
+               Value:     totalTokens,
+               Threshold: cfg.MasterTPD,
+               Window:    "24h",
+           }
+           d.createTrafficSpikeAlert(
+               trafficSpikeSeverity(totalTokens, cfg.MasterTPD),
+               fmt.Sprintf("Master '%s' TPD threshold exceeded (%d >= %d)", masterName, totalTokens, cfg.MasterTPD),
+               fmt.Sprintf("Master '%s' (ID: %d) used %d tokens in the last 24 hours, threshold is %d", masterName, stat.MasterID, totalTokens, cfg.MasterTPD),
+               stat.MasterID, "master", masterName,
+               meta,
+           )
+       }
+   }
+}
+
+// createTrafficSpikeAlert creates a traffic_spike alert with metadata
+func (d *AlertDetector) createTrafficSpikeAlert(
+   severity model.AlertSeverity,
+   title, message string,
+   relatedID uint,
+   relatedType, relatedName string,
+   meta trafficSpikeMetadata,
+) {
+   fingerprint := fmt.Sprintf("%s:%s:%s:%d", model.AlertTypeTrafficSpike, meta.Metric, relatedType, relatedID)
+   cooldownTime := time.Now().UTC().Add(-d.config.DeduplicationCooldown)
+
+   // Check for existing active alert with same fingerprint
+   var count int64
+   d.db.Model(&model.Alert{}).
+       Where("fingerprint = ? AND status = ? AND created_at >= ?", fingerprint, model.AlertStatusActive, cooldownTime).
+       Count(&count)
+   if count > 0 {
+       return // Duplicate within cooldown
+   }
+
+   alert := model.Alert{
+       Type:        model.AlertTypeTrafficSpike,
+       Severity:    severity,
+       Status:      model.AlertStatusActive,
+       Title:       title,
+       Message:     message,
+       RelatedID:   relatedID,
+       RelatedType: relatedType,
+       RelatedName: relatedName,
+       Fingerprint: fingerprint,
+       Metadata:    meta.JSON(),
+   }
+
+   if err := d.db.Create(&alert).Error; err != nil {
+       d.logger.Warn("failed to create traffic spike alert", "metric", meta.Metric, "err", err)
+       return
+   }
+
+   d.logger.Info("traffic spike alert created", "metric", meta.Metric, "severity", severity, "title", title)
+}
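The diff only adds `detectTrafficSpikes` to the detector's existing per-minute pass; the scheduling itself is unchanged and not shown here. A minimal sketch of how such a pass is typically driven by a ticker, assuming a one-minute interval; `runDetector` and the logging call are illustrative, not part of the repository.

```go
package main

import (
	"context"
	"log/slog"
	"time"
)

// runDetector is a hypothetical driver: it invokes detect once per tick until
// the context is cancelled, matching the "every minute" behaviour the docs
// describe. The real AlertDetector wiring lives elsewhere in the cron package.
func runDetector(ctx context.Context, interval time.Duration, detect func(context.Context)) {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			detect(ctx)
		}
	}
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go runDetector(ctx, time.Minute, func(ctx context.Context) {
		slog.Info("detection pass") // e.g. d.detectOnce(ctx) in the real detector
	})
	time.Sleep(2 * time.Second) // placeholder for the service's lifetime
}
```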
@@ -16,6 +16,7 @@ const (
    AlertTypeKeyDisabled  AlertType = "key_disabled"
    AlertTypeKeyExpired   AlertType = "key_expired"
    AlertTypeProviderDown AlertType = "provider_down"
+   AlertTypeTrafficSpike AlertType = "traffic_spike"
 )
 
 // AlertSeverity defines the severity level of an alert
internal/model/alert_threshold.go (new file, 46 lines)
@@ -0,0 +1,46 @@
+package model
+
+import (
+   "time"
+
+   "gorm.io/gorm"
+)
+
+// AlertThresholdConfig stores configurable thresholds for traffic spike detection
+type AlertThresholdConfig struct {
+   ID        uint           `gorm:"primaryKey" json:"id"`
+   CreatedAt time.Time      `json:"created_at"`
+   UpdatedAt time.Time      `json:"updated_at"`
+   DeletedAt gorm.DeletedAt `gorm:"index" json:"-"`
+
+   // Global thresholds
+   GlobalQPS int64 `gorm:"not null;default:100" json:"global_qps"` // System-wide QPS threshold
+
+   // Per-master thresholds
+   MasterRPM int64 `gorm:"not null;default:20" json:"master_rpm"`        // Requests per minute threshold
+   MasterRPD int64 `gorm:"not null;default:1000" json:"master_rpd"`      // Requests per day threshold
+   MasterTPM int64 `gorm:"not null;default:10000000" json:"master_tpm"`  // Tokens per minute threshold
+   MasterTPD int64 `gorm:"not null;default:100000000" json:"master_tpd"` // Tokens per day threshold
+
+   // Minimum sample thresholds for 1-minute window checks
+   MinRPMRequests1m int64 `gorm:"not null;default:10" json:"min_rpm_requests_1m"`    // Min requests in 1m to trigger RPM check
+   MinTPMTokens1m   int64 `gorm:"not null;default:1000000" json:"min_tpm_tokens_1m"` // Min tokens in 1m to trigger TPM check
+}
+
+// DefaultAlertThresholdConfig returns the default threshold configuration
+func DefaultAlertThresholdConfig() AlertThresholdConfig {
+   return AlertThresholdConfig{
+       GlobalQPS:        100,
+       MasterRPM:        20,
+       MasterRPD:        1000,
+       MasterTPM:        10_000_000,
+       MasterTPD:        100_000_000,
+       MinRPMRequests1m: 10,
+       MinTPMTokens1m:   1_000_000,
+   }
+}
+
+// TableName returns the table name for AlertThresholdConfig
+func (AlertThresholdConfig) TableName() string {
+   return "alert_threshold_configs"
+}
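The threshold configuration is effectively a single row: the handler loads it with `First`, seeds `DefaultAlertThresholdConfig` when nothing exists yet, and saves updates back. A self-contained sketch of that load-or-create pattern with GORM and an in-memory SQLite database; the trimmed `ThresholdConfig` struct, the `errors.Is` check, and the SQLite driver are illustrative choices rather than the repository's own code (which compares the error string directly).

```go
package main

import (
	"errors"
	"fmt"

	"gorm.io/driver/sqlite"
	"gorm.io/gorm"
)

// ThresholdConfig is a trimmed, illustrative copy of the model added above.
type ThresholdConfig struct {
	ID        uint  `gorm:"primaryKey"`
	GlobalQPS int64 `gorm:"not null;default:100"`
	MasterRPM int64 `gorm:"not null;default:20"`
}

// loadOrCreate returns the singleton config row, creating defaults on first use.
func loadOrCreate(db *gorm.DB) (ThresholdConfig, error) {
	var cfg ThresholdConfig
	err := db.First(&cfg).Error
	if errors.Is(err, gorm.ErrRecordNotFound) {
		cfg = ThresholdConfig{GlobalQPS: 100, MasterRPM: 20}
		return cfg, db.Create(&cfg).Error
	}
	return cfg, err
}

func main() {
	db, err := gorm.Open(sqlite.Open("file::memory:?cache=shared"), &gorm.Config{})
	if err != nil {
		panic(err)
	}
	if err := db.AutoMigrate(&ThresholdConfig{}); err != nil {
		panic(err)
	}
	cfg, err := loadOrCreate(db) // first call seeds the defaults
	fmt.Println(cfg.GlobalQPS, cfg.MasterRPM, err)
}
```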