feat(alerts): add traffic spike detection with configurable thresholds

Introduce a traffic_spike alert type for monitoring system-wide and per-master
traffic levels, with configurable thresholds stored in the database.

- Add AlertThresholdConfig model for persistent threshold configuration
- Implement GET/PUT /admin/alerts/thresholds endpoints for threshold management
- Add traffic spike detection in alert detector cron job:
  - Global QPS monitoring across all masters
  - Per-master RPM/TPM checks with minimum sample thresholds
  - Per-master RPD/TPD checks for daily limits
- Use warning severity at the threshold and critical severity at 2x the threshold
- Include metric metadata (value, threshold, window) in alert details
- Update API documentation with new endpoints and alert type
This commit is contained in:
zenfun
2025-12-31 15:56:17 +08:00
parent 85d91cdd2e
commit ba54abd424
6 changed files with 563 additions and 3 deletions

View File

@@ -143,7 +143,7 @@ func main() {
// Auto Migrate
if logDB != db {
if err := db.AutoMigrate(&model.Master{}, &model.Key{}, &model.ProviderGroup{}, &model.APIKey{}, &model.Model{}, &model.Binding{}, &model.Namespace{}, &model.OperationLog{}, &model.SyncOutbox{}, &model.Alert{}); err != nil {
if err := db.AutoMigrate(&model.Master{}, &model.Key{}, &model.ProviderGroup{}, &model.APIKey{}, &model.Model{}, &model.Binding{}, &model.Namespace{}, &model.OperationLog{}, &model.SyncOutbox{}, &model.Alert{}, &model.AlertThresholdConfig{}); err != nil {
fatal(logger, "failed to auto migrate", "err", err)
}
if err := logDB.AutoMigrate(&model.LogRecord{}); err != nil {
@@ -153,7 +153,7 @@ func main() {
fatal(logger, "failed to ensure log indexes", "err", err)
}
} else {
if err := db.AutoMigrate(&model.Master{}, &model.Key{}, &model.ProviderGroup{}, &model.APIKey{}, &model.Model{}, &model.Binding{}, &model.Namespace{}, &model.OperationLog{}, &model.LogRecord{}, &model.SyncOutbox{}, &model.Alert{}); err != nil {
if err := db.AutoMigrate(&model.Master{}, &model.Key{}, &model.ProviderGroup{}, &model.APIKey{}, &model.Model{}, &model.Binding{}, &model.Namespace{}, &model.OperationLog{}, &model.LogRecord{}, &model.SyncOutbox{}, &model.Alert{}, &model.AlertThresholdConfig{}); err != nil {
fatal(logger, "failed to auto migrate", "err", err)
}
if err := service.EnsureLogIndexes(db); err != nil {
@@ -370,6 +370,8 @@ func main() {
adminGroup.GET("/alerts", alertHandler.ListAlerts)
adminGroup.POST("/alerts", alertHandler.CreateAlert)
adminGroup.GET("/alerts/stats", alertHandler.GetAlertStats)
adminGroup.GET("/alerts/thresholds", alertHandler.GetAlertThresholds)
adminGroup.PUT("/alerts/thresholds", alertHandler.UpdateAlertThresholds)
adminGroup.GET("/alerts/:id", alertHandler.GetAlert)
adminGroup.POST("/alerts/:id/ack", alertHandler.AcknowledgeAlert)
adminGroup.POST("/alerts/:id/resolve", alertHandler.ResolveAlert)