feat: add Service Monitor and Error Alert Integration Guideline
This commit is contained in:
parent
07a0af3a35
commit
6a428b434c
@ -1,10 +1,16 @@
|
||||
# 1. Prometheus Alert Rule Configuration
|
||||
# 1. Prerequisites
|
||||
|
||||
## 1.1. Add `prometheusrule.yaml` to `<helm-pkg>/templates`.
|
||||
Before proceeding with the steps in this document, ensure your service has integrated Prometheus metrics collection. For details, refer to
|
||||
[prometheus-metrics-intergration-guideline.md](prometheus-metrics-intergration-guideline.md)
|
||||
|
||||
# 2. Prometheus Alert Rule Configuration
|
||||
|
||||
|
||||
## 2.1. Add `prometheusrule.yaml` to `<helm-pkg>/templates`.
|
||||
|
||||
Example:
|
||||
|
||||
> Update metrics to your service name, see freeleaps-ops/freeleaps/helm-pkg/metrics
|
||||
> Update the metrics configuration to your service name. See `freeleaps-ops/freeleaps/helm-pkg/metrics`.
|
||||
```yaml
|
||||
{{- /*
|
||||
Copyright Broadcom, Inc. All Rights Reserved.
|
||||
@ -45,40 +51,41 @@ spec:
|
||||
{{- end }}
|
||||
```
|
||||
|
||||
## 1.2. Add prometheusrule configuration to values.{alpha/prod}.yaml
|
||||
## 2.2. Add prometheusrule configuration to `values.{alpha/prod}.yaml`
|
||||
Example:
|
||||
|
||||
> See freeleaps-ops/freeleaps/helm-pkg/metrics
|
||||
> See `freeleaps-ops/freeleaps/helm-pkg/metrics`.
|
||||
|
||||
```yaml
|
||||
prometheusRule:
|
||||
name: freepeals-metrics
|
||||
enabled: true
|
||||
namespace: "freeleaps-monitoring-system"
|
||||
name: freeleaps-prod-metrics
|
||||
enabled: true # set to false in the alpha environment
|
||||
namespace: freeleaps-monitoring-system
|
||||
labels:
|
||||
release: kube-prometheus-stack
|
||||
rules:
|
||||
- alert: FreeleapsMetricsServiceDown
|
||||
expr: up{job="metrics-service"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
service: metrics-service
|
||||
annotations:
|
||||
summary: "Freeleaps Metrics service is down (instance {{ $labels.instance }})"
|
||||
description: "Freeleaps Metrics service has been down for more than 1 minutes."
|
||||
runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7"
|
||||
|
||||
- alert: FreeleapsMetricsServiceHighErrorRate
|
||||
expr: rate(http_requests_total{job="metrics-service",status=~"5.."}[5m]) > 0.1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: metrics-service
|
||||
annotations:
|
||||
summary: "High error rate in freeleaps metrics service (instance {{ $labels.instance }})"
|
||||
description: "Freeleaps Metrics service error rate is {{ $value }} errors per second."
|
||||
runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7"
|
||||
- alert: FreeleapsMetricsServiceDown # Service down alert
|
||||
expr: up{job="metrics-service"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical # severity: warning/info/critical
|
||||
service: metrics-service # service name
|
||||
namespace: freeleaps-prod # namespace of the service
|
||||
annotations:
|
||||
summary: Freeleaps Metrics service is down (instance {{ $labels.instance }}) # summary
|
||||
description: Freeleaps Metrics service has been down for more than 1 minute. # description
|
||||
runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7 # Runbook url
|
||||
- alert: FreeleapsMetricsServiceHighErrorRate
|
||||
expr: rate(http_requests_total{job="metrics-service",status=~"5.."}[5m]) > 0.1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: metrics-service
|
||||
namespace: freeleaps-prod
|
||||
annotations:
|
||||
summary: High error rate in freeleaps metrics service (instance {{ $labels.instance }})
|
||||
description: Freeleaps Metrics service error rate is {{ $value }} errors per second.
|
||||
runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
|
||||
```
|
||||
|
||||
## 2.3. Verify Alert Rule Configuration is Effective
|
||||
@ -88,4 +95,84 @@ prometheusRule:
|
||||
|
||||
> The newly added rules should appear in the list, which indicates that they have taken effect.
|
||||
|
||||

|
||||

|
||||
|
||||
|
||||
# 3. Add AlertmanagerConfig (Email Notifications)
|
||||
## 3.1. Add AlertmanagerConfig
|
||||
> If there is no `AlertmanagerConfig` in the namespace, create one. If it already exists, no action is required.
|
||||
|
||||
To create a new `AlertmanagerConfig`, refer to `freeleaps-ops/altermanager/altermanager-config.yaml`.
|
||||
```yaml
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
type: Opaque
|
||||
metadata:
|
||||
name: altermanager-email-credentials
|
||||
namespace: freeleaps-prod # The namespace whose service alerts you want to configure
|
||||
data:
|
||||
password: <base64-encoded-smtp-password> # placeholder: supply your own value; never commit real credentials
|
||||
---
|
||||
apiVersion: monitoring.coreos.com/v1alpha1
|
||||
kind: AlertmanagerConfig
|
||||
metadata:
|
||||
name: alertmanager-config
|
||||
namespace: freeleaps-prod # The namespace whose service alerts you want to configure
|
||||
spec:
|
||||
receivers:
|
||||
# - msteamsConfigs:
|
||||
# - sendResolved: true
|
||||
# text: '{{ template "msteams.default.text" . }}'
|
||||
# title: >-
|
||||
# {{ if eq .Status "firing" }}🚨 [FIRING] 🔥{{- else -}}🙌 [RESOLVED]
|
||||
# 🍻{{- end -}}
|
||||
# webhookUrl:
|
||||
# key: webhook-url
|
||||
# name: freeleaps-teams-webhook
|
||||
# name: ms-teams
|
||||
- emailConfigs:
|
||||
- to: "icecheng@mathmast.com" # email recipient
|
||||
from: "support@freeleaps.com" # email sender
|
||||
smarthost: "smtp.freeleaps.com:465"
|
||||
authUsername: "support@freeleaps.com"
|
||||
authPassword:
|
||||
name: "altermanager-email-credentials"
|
||||
key: "password"
|
||||
authIdentity: "support@freeleaps.com"
|
||||
requireTLS: false
|
||||
sendResolved: true
|
||||
headers: # email Subject configuration
|
||||
- key: Subject
|
||||
value: '{{ if eq .Status "firing" }}🚨 Freeleaps Alert: {{ .CommonAnnotations.summary }}{{ else }}✅ Freeleaps Resolved: {{ .CommonAnnotations.summary }}{{ end }}'
|
||||
html: |- # email content configuration
|
||||
<h3><strong>{{ if eq .Status "firing" }}🚨 Alert: {{ .CommonAnnotations.summary }}{{ else }}✅ Resolved: {{ .CommonAnnotations.summary }}{{ end }}</strong></h3>
|
||||
<p><strong>📝 AlertName:</strong> {{ .CommonLabels.alertname }}</p>
|
||||
<p><strong>🔧 Service:</strong> {{ .CommonLabels.service }}</p>
|
||||
<p><strong>🔧 Pod:</strong> {{ .CommonLabels.pod }}({{ .CommonLabels.instance }})</p>
|
||||
<p><strong>🏷️ Severity:</strong> {{ .CommonLabels.severity }}</p>
|
||||
<p><strong>{{ if eq .Status "firing" }}🔴 Status:{{ else }}🟢 Status:{{ end }}</strong> {{ .Status | toUpper }}</p>
|
||||
<p>📝 Description: {{ .CommonAnnotations.description }}</p>
|
||||
<p>📖 Runbook: <a href="{{ .CommonAnnotations.runbook_url }}">{{ .CommonAnnotations.runbook_url }}</a></p>
|
||||
name: email
|
||||
route:
|
||||
groupBy:
|
||||
- severity
|
||||
groupInterval: 5m
|
||||
receiver: email
|
||||
groupWait: 5m
|
||||
repeatInterval: 6h
|
||||
```
|
||||
## 3.2. Verify Configuration Success
|
||||
|
||||
> Trigger an alert and check the pages below for alert data. If present, the configuration is successful.
|
||||
|
||||

|
||||

|
||||
|
||||
## 3.3. Verify Email Notification Success
|
||||

|
||||

|
||||
|
||||
# 4. Teams Alert Integration
|
||||
|
||||
TODO
|
||||
BIN
docs/asserts/image6.png
Normal file
BIN
docs/asserts/image6.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 714 KiB |
BIN
docs/asserts/image7.png
Normal file
BIN
docs/asserts/image7.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 357 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 307 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 299 KiB |
Loading…
Reference in New Issue
Block a user