Files
ez-api/internal/api/alert_handler.go
zenfun ba54abd424 feat(alerts): add traffic spike detection with configurable thresholds
Introduce traffic_spike alert type for monitoring system and per-master
traffic levels with configurable thresholds stored in database.

- Add AlertThresholdConfig model for persistent threshold configuration
- Implement GET/PUT /admin/alerts/thresholds endpoints for threshold management
- Add traffic spike detection in alert detector cron job:
  - Global QPS monitoring across all masters
  - Per-master RPM/TPM checks with minimum sample thresholds
  - Per-master RPD/TPD checks for daily limits
- Use warning severity at threshold, critical at 2x threshold
- Include metric metadata (value, threshold, window) in alert details
- Update API documentation with new endpoints and alert type
2025-12-31 15:56:17 +08:00

564 lines
17 KiB
Go

package api
import (
	"errors"
	"net/http"
	"strings"
	"time"

	"github.com/ez-api/ez-api/internal/model"
	"github.com/gin-gonic/gin"
	"gorm.io/gorm"
)
// AlertHandler serves the admin alert endpoints: listing and fetching alerts,
// creating them, moving them through acknowledge/resolve/dismiss, reporting
// counts, and reading/updating the traffic-spike thresholds.
type AlertHandler struct {
// db is the GORM handle every handler method uses for persistence.
db *gorm.DB
}
// NewAlertHandler returns an AlertHandler whose methods read and write
// through the supplied GORM handle.
func NewAlertHandler(db *gorm.DB) *AlertHandler {
	handler := new(AlertHandler)
	handler.db = db
	return handler
}
// AlertView is the JSON shape of an alert in API responses. It is produced
// from a model.Alert by toAlertView.
type AlertView struct {
ID uint `json:"id"`
Type string `json:"type"`
Severity string `json:"severity"`
Status string `json:"status"`
Title string `json:"title"`
Message string `json:"message"`
// Related* and Metadata are omitted from the JSON when zero/empty.
RelatedID uint `json:"related_id,omitempty"`
RelatedType string `json:"related_type,omitempty"`
RelatedName string `json:"related_name,omitempty"`
Metadata string `json:"metadata,omitempty"`
// All timestamps below are Unix seconds. The pointer-typed ones are nil
// (and therefore omitted) when the underlying model field is nil.
AckedAt *int64 `json:"acked_at,omitempty"`
AckedBy string `json:"acked_by,omitempty"`
ResolvedAt *int64 `json:"resolved_at,omitempty"`
ExpiresAt *int64 `json:"expires_at,omitempty"`
CreatedAt int64 `json:"created_at"`
UpdatedAt int64 `json:"updated_at"`
}
// toAlertView converts a stored model.Alert into its API representation.
// Timestamps are emitted as Unix seconds; the optional ones (acked, resolved,
// expires) stay nil when they are nil on the model.
func toAlertView(a model.Alert) AlertView {
	var out AlertView

	out.ID = a.ID
	out.Type = string(a.Type)
	out.Severity = string(a.Severity)
	out.Status = string(a.Status)
	out.Title = a.Title
	out.Message = a.Message
	out.RelatedID = a.RelatedID
	out.RelatedType = a.RelatedType
	out.RelatedName = a.RelatedName
	out.Metadata = a.Metadata
	out.AckedBy = a.AckedBy
	out.CreatedAt = a.CreatedAt.UTC().Unix()
	out.UpdatedAt = a.UpdatedAt.UTC().Unix()

	// Each optional timestamp gets its own local so the view never aliases
	// another field's value.
	if acked := a.AckedAt; acked != nil {
		sec := acked.UTC().Unix()
		out.AckedAt = &sec
	}
	if resolved := a.ResolvedAt; resolved != nil {
		sec := resolved.UTC().Unix()
		out.ResolvedAt = &sec
	}
	if expires := a.ExpiresAt; expires != nil {
		sec := expires.UTC().Unix()
		out.ExpiresAt = &sec
	}

	return out
}
// ListAlertsResponse is the paginated response body returned by ListAlerts.
type ListAlertsResponse struct {
// Total is the number of alerts matching the request filters, independent of paging.
Total int64 `json:"total"`
// Limit and Offset echo the paging values that were applied to the query.
Limit int `json:"limit"`
Offset int `json:"offset"`
// Items holds the alerts of the requested page.
Items []AlertView `json:"items"`
}
// ListAlerts godoc
// @Summary List alerts
// @Description List system alerts with optional filters
// @Tags admin
// @Produce json
// @Security AdminAuth
// @Param limit query int false "limit (default 50, max 200)"
// @Param offset query int false "offset"
// @Param status query string false "filter by status (active, acknowledged, resolved, dismissed)"
// @Param severity query string false "filter by severity (info, warning, critical)"
// @Param type query string false "filter by type (rate_limit, error_spike, quota_exceeded, key_disabled, key_expired, provider_down, traffic_spike)"
// @Success 200 {object} ListAlertsResponse
// @Failure 500 {object} gin.H
// @Router /admin/alerts [get]
func (h *AlertHandler) ListAlerts(c *gin.Context) {
	// Responds with one page of alerts ordered by descending id, plus the
	// number of rows matching the optional status/severity/type filters.
	limit, offset := parseLimitOffset(c)

	// Every non-blank query parameter becomes an equality condition.
	filters := []struct {
		param  string
		clause string
	}{
		{"status", "status = ?"},
		{"severity", "severity = ?"},
		{"type", "type = ?"},
	}

	query := h.db.Model(&model.Alert{}).Order("id desc")
	for _, f := range filters {
		value := strings.TrimSpace(c.Query(f.param))
		if value == "" {
			continue
		}
		query = query.Where(f.clause, value)
	}

	var total int64
	countErr := query.Count(&total).Error
	if countErr != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to count alerts", "details": countErr.Error()})
		return
	}

	var rows []model.Alert
	findErr := query.Limit(limit).Offset(offset).Find(&rows).Error
	if findErr != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to list alerts", "details": findErr.Error()})
		return
	}

	// Allocate a non-nil slice so an empty page serialises as [] rather than null.
	views := make([]AlertView, len(rows))
	for i := range rows {
		views[i] = toAlertView(rows[i])
	}

	resp := ListAlertsResponse{
		Total:  total,
		Limit:  limit,
		Offset: offset,
		Items:  views,
	}
	c.JSON(http.StatusOK, resp)
}
// GetAlert godoc
// @Summary Get alert
// @Description Get a single alert by ID
// @Tags admin
// @Produce json
// @Security AdminAuth
// @Param id path int true "Alert ID"
// @Success 200 {object} AlertView
// @Failure 400 {object} gin.H
// @Failure 404 {object} gin.H
// @Failure 500 {object} gin.H
// @Router /admin/alerts/{id} [get]
func (h *AlertHandler) GetAlert(c *gin.Context) {
	id, ok := parseUintParam(c, "id")
	if !ok {
		// NOTE(review): parseUintParam presumably writes the 400 response itself — confirm.
		return
	}

	var alert model.Alert
	if err := h.db.First(&alert, id).Error; err != nil {
		// Only a missing row is a 404; any other database failure must not be
		// reported to the client as "alert not found".
		if errors.Is(err, gorm.ErrRecordNotFound) {
			c.JSON(http.StatusNotFound, gin.H{"error": "alert not found"})
			return
		}
		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to load alert", "details": err.Error()})
		return
	}

	c.JSON(http.StatusOK, toAlertView(alert))
}
// CreateAlertRequest is the request body for creating an alert.
// Type, Severity and Title are mandatory (enforced by the binding tags);
// CreateAlert additionally checks Type and Severity against its allowed values.
type CreateAlertRequest struct {
Type string `json:"type" binding:"required"`
Severity string `json:"severity" binding:"required"`
Title string `json:"title" binding:"required"`
Message string `json:"message"`
RelatedID uint `json:"related_id"`
RelatedType string `json:"related_type"`
RelatedName string `json:"related_name"`
// Metadata is stored as-is; CreateAlert does not trim or validate it.
Metadata string `json:"metadata"`
// ExpiresAt is a Unix timestamp in seconds; nil or non-positive means no expiry is set.
ExpiresAt *int64 `json:"expires_at"`
}
// CreateAlert godoc
// @Summary Create alert
// @Description Create a new system alert
// @Tags admin
// @Accept json
// @Produce json
// @Security AdminAuth
// @Param request body CreateAlertRequest true "Alert data"
// @Success 201 {object} AlertView
// @Failure 400 {object} gin.H
// @Failure 500 {object} gin.H
// @Router /admin/alerts [post]
func (h *AlertHandler) CreateAlert(c *gin.Context) {
	// Validates the request, stores a new alert in the active status and
	// responds with 201 and the created alert.
	var req CreateAlertRequest
	if bindErr := c.ShouldBindJSON(&req); bindErr != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": bindErr.Error()})
		return
	}

	// Only the known alert types are accepted.
	switch req.Type {
	case "rate_limit", "error_spike", "quota_exceeded",
		"key_disabled", "key_expired", "provider_down", "traffic_spike":
	default:
		c.JSON(http.StatusBadRequest, gin.H{"error": "invalid alert type"})
		return
	}

	// Likewise for severities.
	switch req.Severity {
	case "info", "warning", "critical":
	default:
		c.JSON(http.StatusBadRequest, gin.H{"error": "invalid severity"})
		return
	}

	var alert model.Alert
	alert.Type = model.AlertType(req.Type)
	alert.Severity = model.AlertSeverity(req.Severity)
	alert.Status = model.AlertStatusActive
	alert.Title = strings.TrimSpace(req.Title)
	alert.Message = strings.TrimSpace(req.Message)
	alert.RelatedID = req.RelatedID
	alert.RelatedType = strings.TrimSpace(req.RelatedType)
	alert.RelatedName = strings.TrimSpace(req.RelatedName)
	alert.Metadata = req.Metadata

	// A missing or non-positive expires_at leaves the alert without an expiry.
	if ts := req.ExpiresAt; ts != nil && *ts > 0 {
		expiry := time.Unix(*ts, 0).UTC()
		alert.ExpiresAt = &expiry
	}

	createErr := h.db.Create(&alert).Error
	if createErr != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to create alert", "details": createErr.Error()})
		return
	}

	c.JSON(http.StatusCreated, toAlertView(alert))
}
// AckAlertRequest is the (optional) request body for acknowledging an alert.
type AckAlertRequest struct {
// AckedBy names who acknowledged the alert; AcknowledgeAlert trims it and
// stores an empty string when the body is absent.
AckedBy string `json:"acked_by"`
}
// AcknowledgeAlert godoc
// @Summary Acknowledge alert
// @Description Mark an alert as acknowledged
// @Tags admin
// @Accept json
// @Produce json
// @Security AdminAuth
// @Param id path int true "Alert ID"
// @Param request body AckAlertRequest false "Ack data"
// @Success 200 {object} AlertView
// @Failure 400 {object} gin.H
// @Failure 404 {object} gin.H
// @Failure 500 {object} gin.H
// @Router /admin/alerts/{id}/ack [post]
func (h *AlertHandler) AcknowledgeAlert(c *gin.Context) {
	// Marks the alert as acknowledged, stamping acked_at with the current UTC
	// time and acked_by with the trimmed name from the optional request body,
	// then responds with the reloaded alert.
	id, ok := parseUintParam(c, "id")
	if !ok {
		return
	}

	var alert model.Alert
	if err := h.db.First(&alert, id).Error; err != nil {
		// Only a missing row is a 404; any other database failure is a server error.
		if errors.Is(err, gorm.ErrRecordNotFound) {
			c.JSON(http.StatusNotFound, gin.H{"error": "alert not found"})
			return
		}
		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to load alert", "details": err.Error()})
		return
	}

	// The body is optional, so a bind failure is deliberately ignored and
	// acked_by falls back to the empty string.
	var req AckAlertRequest
	_ = c.ShouldBindJSON(&req)

	now := time.Now().UTC()
	update := map[string]any{
		"status":   model.AlertStatusAcknowledged,
		"acked_at": now,
		"acked_by": strings.TrimSpace(req.AckedBy),
	}
	if err := h.db.Model(&alert).Updates(update).Error; err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to acknowledge alert", "details": err.Error()})
		return
	}

	// Reload so the response reflects what is actually stored.
	if err := h.db.First(&alert, id).Error; err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to reload alert", "details": err.Error()})
		return
	}

	c.JSON(http.StatusOK, toAlertView(alert))
}
// ResolveAlert godoc
// @Summary Resolve alert
// @Description Mark an alert as resolved
// @Tags admin
// @Produce json
// @Security AdminAuth
// @Param id path int true "Alert ID"
// @Success 200 {object} AlertView
// @Failure 400 {object} gin.H
// @Failure 404 {object} gin.H
// @Failure 500 {object} gin.H
// @Router /admin/alerts/{id}/resolve [post]
func (h *AlertHandler) ResolveAlert(c *gin.Context) {
	// Marks the alert as resolved, stamping resolved_at with the current UTC
	// time, then responds with the reloaded alert.
	id, ok := parseUintParam(c, "id")
	if !ok {
		return
	}

	var alert model.Alert
	if err := h.db.First(&alert, id).Error; err != nil {
		// Only a missing row is a 404; any other database failure is a server error.
		if errors.Is(err, gorm.ErrRecordNotFound) {
			c.JSON(http.StatusNotFound, gin.H{"error": "alert not found"})
			return
		}
		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to load alert", "details": err.Error()})
		return
	}

	now := time.Now().UTC()
	update := map[string]any{
		"status":      model.AlertStatusResolved,
		"resolved_at": now,
	}
	if err := h.db.Model(&alert).Updates(update).Error; err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to resolve alert", "details": err.Error()})
		return
	}

	// Reload so the response reflects what is actually stored.
	if err := h.db.First(&alert, id).Error; err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to reload alert", "details": err.Error()})
		return
	}

	c.JSON(http.StatusOK, toAlertView(alert))
}
// DismissAlert godoc
// @Summary Dismiss alert
// @Description Dismiss an alert (soft delete)
// @Tags admin
// @Produce json
// @Security AdminAuth
// @Param id path int true "Alert ID"
// @Success 200 {object} gin.H
// @Failure 400 {object} gin.H
// @Failure 404 {object} gin.H
// @Failure 500 {object} gin.H
// @Router /admin/alerts/{id} [delete]
func (h *AlertHandler) DismissAlert(c *gin.Context) {
	// Dismisses an alert by setting its status to dismissed; the row itself
	// is kept.
	id, ok := parseUintParam(c, "id")
	if !ok {
		return
	}

	var alert model.Alert
	if err := h.db.First(&alert, id).Error; err != nil {
		// Only a missing row is a 404; any other database failure is a server error.
		if errors.Is(err, gorm.ErrRecordNotFound) {
			c.JSON(http.StatusNotFound, gin.H{"error": "alert not found"})
			return
		}
		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to load alert", "details": err.Error()})
		return
	}

	if err := h.db.Model(&alert).Update("status", model.AlertStatusDismissed).Error; err != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to dismiss alert", "details": err.Error()})
		return
	}

	c.JSON(http.StatusOK, gin.H{"status": "dismissed"})
}
// AlertStats is the response body of GetAlertStats.
type AlertStats struct {
// Total counts every alert regardless of status.
Total int64 `json:"total"`
// Active, Acknowledged and Resolved count alerts by their status.
Active int64 `json:"active"`
Acknowledged int64 `json:"acknowledged"`
Resolved int64 `json:"resolved"`
// Critical, Warning and Info count alerts of that severity whose status is
// "active" only; alerts in any other status are not included.
Critical int64 `json:"critical"`
Warning int64 `json:"warning"`
Info int64 `json:"info"`
}
// GetAlertStats godoc
// @Summary Alert statistics
// @Description Get alert count statistics by status and severity
// @Tags admin
// @Produce json
// @Security AdminAuth
// @Success 200 {object} AlertStats
// @Failure 500 {object} gin.H
// @Router /admin/alerts/stats [get]
func (h *AlertHandler) GetAlertStats(c *gin.Context) {
	// Reports alert counts by status and, for active alerts, by severity.
	// Any failing count aborts with a 500 instead of silently reporting zero.
	var stats AlertStats

	// One entry per counter: where the result goes and the filter to apply.
	// An empty filter counts every alert.
	counters := []struct {
		dest  *int64
		where string
		args  []any
	}{
		{dest: &stats.Total},
		{dest: &stats.Active, where: "status = ?", args: []any{"active"}},
		{dest: &stats.Acknowledged, where: "status = ?", args: []any{"acknowledged"}},
		{dest: &stats.Resolved, where: "status = ?", args: []any{"resolved"}},
		{dest: &stats.Critical, where: "severity = ? AND status = ?", args: []any{"critical", "active"}},
		{dest: &stats.Warning, where: "severity = ? AND status = ?", args: []any{"warning", "active"}},
		{dest: &stats.Info, where: "severity = ? AND status = ?", args: []any{"info", "active"}},
	}

	for _, counter := range counters {
		query := h.db.Model(&model.Alert{})
		if counter.where != "" {
			query = query.Where(counter.where, counter.args...)
		}
		if err := query.Count(counter.dest).Error; err != nil {
			c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to count alerts", "details": err.Error()})
			return
		}
	}

	c.JSON(http.StatusOK, stats)
}
// AlertThresholdView is the JSON shape of the traffic-spike threshold
// configuration. It is built from a model.AlertThresholdConfig by
// toAlertThresholdView.
// NOTE(review): the exact meaning and units of each threshold are defined by
// the detector that consumes the config, not by this file — confirm there.
type AlertThresholdView struct {
GlobalQPS int64 `json:"global_qps"`
MasterRPM int64 `json:"master_rpm"`
MasterRPD int64 `json:"master_rpd"`
MasterTPM int64 `json:"master_tpm"`
MasterTPD int64 `json:"master_tpd"`
MinRPMRequests1m int64 `json:"min_rpm_requests_1m"`
MinTPMTokens1m int64 `json:"min_tpm_tokens_1m"`
// UpdatedAt is the config's last update time as Unix seconds.
UpdatedAt int64 `json:"updated_at"`
}
// toAlertThresholdView converts the stored threshold configuration into its
// API representation, emitting UpdatedAt as Unix seconds.
func toAlertThresholdView(cfg model.AlertThresholdConfig) AlertThresholdView {
	var out AlertThresholdView

	out.GlobalQPS = cfg.GlobalQPS
	out.MasterRPM = cfg.MasterRPM
	out.MasterRPD = cfg.MasterRPD
	out.MasterTPM = cfg.MasterTPM
	out.MasterTPD = cfg.MasterTPD
	out.MinRPMRequests1m = cfg.MinRPMRequests1m
	out.MinTPMTokens1m = cfg.MinTPMTokens1m
	out.UpdatedAt = cfg.UpdatedAt.UTC().Unix()

	return out
}
// GetAlertThresholds godoc
// @Summary Get alert thresholds
// @Description Get current alert threshold configuration for traffic spike detection
// @Tags admin
// @Produce json
// @Security AdminAuth
// @Success 200 {object} AlertThresholdView
// @Failure 500 {object} gin.H
// @Router /admin/alerts/thresholds [get]
func (h *AlertHandler) GetAlertThresholds(c *gin.Context) {
	// Responds with the current threshold configuration as loaded by
	// loadThresholdConfig.
	cfg, loadErr := h.loadThresholdConfig()
	if loadErr != nil {
		failure := gin.H{"error": "failed to load thresholds", "details": loadErr.Error()}
		c.JSON(http.StatusInternalServerError, failure)
		return
	}

	view := toAlertThresholdView(cfg)
	c.JSON(http.StatusOK, view)
}
// UpdateAlertThresholdsRequest is the request body for updating thresholds.
// Every field is optional: a nil pointer leaves the stored value unchanged.
type UpdateAlertThresholdsRequest struct {
// The five limits below must be strictly positive when provided.
GlobalQPS *int64 `json:"global_qps"`
MasterRPM *int64 `json:"master_rpm"`
MasterRPD *int64 `json:"master_rpd"`
MasterTPM *int64 `json:"master_tpm"`
MasterTPD *int64 `json:"master_tpd"`
// The two minimum-sample values may be zero but not negative.
MinRPMRequests1m *int64 `json:"min_rpm_requests_1m"`
MinTPMTokens1m *int64 `json:"min_tpm_tokens_1m"`
}
// UpdateAlertThresholds godoc
// @Summary Update alert thresholds
// @Description Update alert threshold configuration for traffic spike detection
// @Tags admin
// @Accept json
// @Produce json
// @Security AdminAuth
// @Param request body UpdateAlertThresholdsRequest true "Threshold configuration"
// @Success 200 {object} AlertThresholdView
// @Failure 400 {object} gin.H
// @Failure 500 {object} gin.H
// @Router /admin/alerts/thresholds [put]
func (h *AlertHandler) UpdateAlertThresholds(c *gin.Context) {
	// Applies the provided threshold values on top of the stored
	// configuration and responds with the saved result. Omitted fields keep
	// their current value.
	var req UpdateAlertThresholdsRequest
	if bindErr := c.ShouldBindJSON(&req); bindErr != nil {
		c.JSON(http.StatusBadRequest, gin.H{"error": bindErr.Error()})
		return
	}

	// One entry per updatable field, in validation order. min is the smallest
	// accepted value: 1 for "must be positive", 0 for "must be non-negative".
	fields := []struct {
		value *int64
		min   int64
		msg   string
	}{
		{req.GlobalQPS, 1, "global_qps must be positive"},
		{req.MasterRPM, 1, "master_rpm must be positive"},
		{req.MasterRPD, 1, "master_rpd must be positive"},
		{req.MasterTPM, 1, "master_tpm must be positive"},
		{req.MasterTPD, 1, "master_tpd must be positive"},
		{req.MinRPMRequests1m, 0, "min_rpm_requests_1m must be non-negative"},
		{req.MinTPMTokens1m, 0, "min_tpm_tokens_1m must be non-negative"},
	}

	// Validate everything before touching the database.
	for _, f := range fields {
		if f.value != nil && *f.value < f.min {
			c.JSON(http.StatusBadRequest, gin.H{"error": f.msg})
			return
		}
	}

	cfg, loadErr := h.loadThresholdConfig()
	if loadErr != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to load thresholds", "details": loadErr.Error()})
		return
	}

	// Destinations line up index-for-index with fields above.
	targets := []*int64{
		&cfg.GlobalQPS,
		&cfg.MasterRPM,
		&cfg.MasterRPD,
		&cfg.MasterTPM,
		&cfg.MasterTPD,
		&cfg.MinRPMRequests1m,
		&cfg.MinTPMTokens1m,
	}
	for i, f := range fields {
		if f.value == nil {
			continue
		}
		*targets[i] = *f.value
	}

	saveErr := h.db.Save(&cfg).Error
	if saveErr != nil {
		c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to save thresholds", "details": saveErr.Error()})
		return
	}

	c.JSON(http.StatusOK, toAlertThresholdView(cfg))
}
// loadThresholdConfig returns the stored alert threshold configuration.
// When no row exists yet, it persists model.DefaultAlertThresholdConfig()
// and returns that. Any other database error is returned unchanged.
func (h *AlertHandler) loadThresholdConfig() (model.AlertThresholdConfig, error) {
	var cfg model.AlertThresholdConfig

	err := h.db.First(&cfg).Error
	if err == nil {
		return cfg, nil
	}

	// Match the sentinel with errors.Is rather than comparing message text,
	// so wrapped errors are still recognised as "no row yet".
	if !errors.Is(err, gorm.ErrRecordNotFound) {
		return cfg, err
	}

	// First use: seed the table with the defaults.
	cfg = model.DefaultAlertThresholdConfig()
	if createErr := h.db.Create(&cfg).Error; createErr != nil {
		return cfg, createErr
	}
	return cfg, nil
}