This commit is contained in:
Nicolas 2025-09-23 17:38:27 +08:00
commit d9209de99a
14 changed files with 124 additions and 62 deletions

View File

@ -1,32 +1,91 @@
# Prometheus Alter Rule Config # 1. Prometheus Alert Rule Configuration
Add `prometheusrule.yaml` to `<helm-pkg>/templates`. ## 1.1. Add `prometheusrule.yaml` to `<helm-pkg>/templates`.
see
``` Example:
> Replace `metrics` with your own service name; see freeleaps-ops/freeleaps/helm-pkg/metrics for reference.
```yaml
{{- /* {{- /*
Copyright Broadcom, Inc. All Rights Reserved. Copyright Broadcom, Inc. All Rights Reserved.
SPDX-License-Identifier: APACHE-2.0 SPDX-License-Identifier: APACHE-2.0
*/}} */}}
{{- if and .Values.metrics.enabled .Values.metrics.prometheusRule.enabled }} {{- if .Values.metrics.prometheusRule.enabled }}
apiVersion: monitoring.coreos.com/v1 apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule kind: PrometheusRule
metadata: metadata:
name: {{ include "common.names.fullname" . }} name: {{ .Values.metrics.prometheusRule.name }}
namespace: {{ default (include "common.names.namespace" .) .Values.metrics.prometheusRule.namespace | quote}} namespace: {{ .Values.metrics.prometheusRule.namespace | quote }}
labels: {{- include "common.labels.standard" ( dict "customLabels" .Values.commonLabels "context" $ ) | nindent 4 }} {{- with .Values.metrics.prometheusRule.labels }}
{{- if .Values.metrics.prometheusRule.additionalLabels }} labels:
{{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 4 }} {{- toYaml . | nindent 4 }}
{{- end }}
{{- if .Values.commonAnnotations }}
annotations: {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 4 }}
{{- end }} {{- end }}
spec: spec:
groups: groups:
{{- with .Values.metrics.prometheusRule.rules }} {{- with .Values.metrics.prometheusRule.rules }}
- name: {{ template "common.names.name" $ }} - name: {{ $.Values.metrics.prometheusRule.name }}
rules: {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 8 }} rules:
{{- range . }}
- alert: {{ .alert }}
expr: {{ .expr | quote }}
{{- if .for }}
for: {{ .for }}
{{- end }}
{{- if .labels }}
labels:
{{- toYaml .labels | nindent 12 }}
{{- end }}
{{- if .annotations }}
annotations:
{{- toYaml .annotations | nindent 12 }}
{{- end }}
{{- end }}
{{- end }} {{- end }}
{{- end }} {{- end }}
``` ```
## 1.2. Add prometheusrule configuration to values.{alpha/prod}.yaml
Example:
> See freeleaps-ops/freeleaps/helm-pkg/metrics
```yaml
prometheusRule:
name: freepeals-metrics
enabled: true
namespace: "freeleaps-monitoring-system"
labels:
release: kube-prometheus-stack
rules:
- alert: FreeleapsMetricsServiceDown
expr: up{job="metrics-service"} == 0
for: 1m
labels:
severity: critical
service: metrics-service
annotations:
summary: "Freeleaps Metrics service is down (instance {{ $labels.instance }})"
description: "Freeleaps Metrics service has been down for more than 1 minutes."
runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7"
- alert: FreeleapsMetricsServiceHighErrorRate
expr: rate(http_requests_total{job="metrics-service",status=~"5.."}[5m]) > 0.1
for: 5m
labels:
severity: warning
service: metrics-service
annotations:
summary: "High error rate in freeleaps metrics service (instance {{ $labels.instance }})"
description: "Freeleaps Metrics service error rate is {{ $value }} errors per second."
runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7"
```
## 1.3. Verify Alert Rule Configuration is Effective
> Forward the service to your local machine (for example, with a port-forward) and open it in your browser:
![alt text](asserts/image4.png)
> The newly added rules appear in the list, which confirms that they have taken effect:
![alt text](asserts/image5.png)

BIN
docs/asserts/image4.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 569 KiB

BIN
docs/asserts/image5.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 341 KiB

View File

@ -25,6 +25,7 @@ auth:
securePassword: true securePassword: true
updatePassword: false updatePassword: false
enableLoopbackUser: false enableLoopbackUser: false
erlangCookie: "iGF4ZVjeaXgHW2xELZTxJl8a6aFY7nes"
tls: tls:
enabled: false enabled: false
logs: "-" logs: "-"

View File

@ -162,6 +162,7 @@ auth:
## ref: https://github.com/bitnami/containers/tree/main/bitnami/rabbitmq#environment-variables ## ref: https://github.com/bitnami/containers/tree/main/bitnami/rabbitmq#environment-variables
## ##
updatePassword: false updatePassword: false
## @param auth.existingPasswordSecret Existing secret with RabbitMQ credentials (existing secret must contain a value for `rabbitmq-password` key or override with setting auth.existingSecretPasswordKey) ## @param auth.existingPasswordSecret Existing secret with RabbitMQ credentials (existing secret must contain a value for `rabbitmq-password` key or override with setting auth.existingSecretPasswordKey)
## e.g: ## e.g:
## existingPasswordSecret: name-of-existing-secret ## existingPasswordSecret: name-of-existing-secret
@ -177,7 +178,7 @@ auth:
## @param auth.erlangCookie Erlang cookie to determine whether different nodes are allowed to communicate with each other ## @param auth.erlangCookie Erlang cookie to determine whether different nodes are allowed to communicate with each other
## ref: https://github.com/bitnami/containers/tree/main/bitnami/rabbitmq#environment-variables ## ref: https://github.com/bitnami/containers/tree/main/bitnami/rabbitmq#environment-variables
## ##
erlangCookie: "" erlangCookie: "iGF4ZVjeaXgHW2xELZTxJl8a6aFY7nes"
## @param auth.existingErlangSecret Existing secret with RabbitMQ Erlang cookie (must contain a value for `rabbitmq-erlang-cookie` key or override with auth.existingSecretErlangKey) ## @param auth.existingErlangSecret Existing secret with RabbitMQ Erlang cookie (must contain a value for `rabbitmq-erlang-cookie` key or override with auth.existingSecretErlangKey)
## e.g: ## e.g:
## existingErlangSecret: name-of-existing-secret ## existingErlangSecret: name-of-existing-secret

View File

@ -1,7 +1,3 @@
global:
registry: docker.io
repository: freeleaps
nodeSelector: {}
dashboard: dashboard:
enabled: true enabled: true
name: freeleaps-prod-authentication-dashboard name: freeleaps-prod-authentication-dashboard
@ -20,7 +16,7 @@ authentication:
registry: docker.io registry: docker.io
repository: null repository: null
name: authentication name: authentication
tag: 1.9.0 tag: snapshot-ee519ca
imagePullPolicy: IfNotPresent imagePullPolicy: IfNotPresent
ports: ports:
- name: http - name: http

View File

@ -18,7 +18,7 @@ content:
registry: docker.io registry: docker.io
repository: null repository: null
name: content name: content
tag: 1.9.0 tag: snapshot-ee519ca
imagePullPolicy: IfNotPresent imagePullPolicy: IfNotPresent
ports: ports:
- name: http - name: http

View File

@ -10,7 +10,7 @@ devops:
registry: docker.io registry: docker.io
repository: null repository: null
name: devops name: devops
tag: snapshot-96f2f52 tag: snapshot-3cba9e4
imagePullPolicy: IfNotPresent imagePullPolicy: IfNotPresent
ports: ports:
- name: http - name: http

View File

@ -84,7 +84,9 @@ metrics:
prometheusRule: prometheusRule:
name: freepeals-alpha-metrics name: freepeals-alpha-metrics
enabled: false enabled: false
namespace: freeleaps-monitoring-system namespace: "freeleaps-monitoring-system"
labels:
release: kube-prometheus-stack
rules: rules:
- alert: FreeleapsMetricsServiceDown - alert: FreeleapsMetricsServiceDown
expr: up{job="metrics-service"} == 0 expr: up{job="metrics-service"} == 0

View File

@ -15,7 +15,7 @@ metrics:
registry: docker.io registry: docker.io
repository: null repository: null
name: metrics name: metrics
tag: snapshot-38ff0ae tag: snapshot-3cba9e4
imagePullPolicy: IfNotPresent imagePullPolicy: IfNotPresent
ports: ports:
- name: http - name: http
@ -85,6 +85,8 @@ metrics:
name: freepeals-prod-metrics name: freepeals-prod-metrics
enabled: true enabled: true
namespace: "freeleaps-monitoring-system" namespace: "freeleaps-monitoring-system"
labels:
release: kube-prometheus-stack
rules: rules:
- alert: FreeleapsMetricsServiceDown - alert: FreeleapsMetricsServiceDown
expr: up{job="metrics-service"} == 0 expr: up{job="metrics-service"} == 0
@ -93,10 +95,9 @@ metrics:
severity: critical severity: critical
service: metrics-service service: metrics-service
annotations: annotations:
summary: "Freeleaps Metrics service is down (instance {{ $labels.instance }})" summary: Freeleaps Metrics service is down (instance {{ $labels.instance }})
description: "Freeleaps Metrics service has been down for more than 1 minutes." description: Freeleaps Metrics service has been down for more than 1 minutes.
runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7" runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7
- alert: FreeleapsMetricsServiceHighErrorRate - alert: FreeleapsMetricsServiceHighErrorRate
expr: rate(http_requests_total{job="metrics-service",status=~"5.."}[5m]) > 0.1 expr: rate(http_requests_total{job="metrics-service",status=~"5.."}[5m]) > 0.1
for: 5m for: 5m
@ -104,6 +105,6 @@ metrics:
severity: warning severity: warning
service: metrics-service service: metrics-service
annotations: annotations:
summary: "High error rate in freeleaps metrics service (instance {{ $labels.instance }})" summary: High error rate in freeleaps metrics service (instance {{ $labels.instance }})
description: "Freeleaps Metrics service error rate is {{ $value }} errors per second." description: Freeleaps Metrics service error rate is {{ $value }} errors per second.
runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7" runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7

View File

@ -84,6 +84,8 @@ metrics:
name: freepeals-metrics name: freepeals-metrics
enabled: true enabled: true
namespace: "freeleaps-monitoring-system" namespace: "freeleaps-monitoring-system"
labels:
release: kube-prometheus-stack
rules: rules:
- alert: FreeleapsMetricsServiceDown - alert: FreeleapsMetricsServiceDown
expr: up{job="metrics-service"} == 0 expr: up{job="metrics-service"} == 0

View File

@ -32,23 +32,23 @@ notification:
readiness: readiness:
type: httpGet type: httpGet
config: config:
path: /api/_/readyz path: /api/_/livez
port: 8003 port: 8003
initialDelaySeconds: 30 initialDelaySeconds: 60
periodSeconds: 30 periodSeconds: 30
timeoutSeconds: 3 timeoutSeconds: 3
successThreshold: 1 successThreshold: 1
failureThreshold: 3 failureThreshold: 5
liveness: liveness:
type: httpGet type: httpGet
config: config:
path: /api/_/livez path: /api/_/livez
port: 8003 port: 8003
initialDelaySeconds: 30 initialDelaySeconds: 60
periodSeconds: 15 periodSeconds: 30
timeoutSeconds: 3 timeoutSeconds: 3
successThreshold: 1 successThreshold: 1
failureThreshold: 3 failureThreshold: 5
terminationGracePeriodSeconds: 30 terminationGracePeriodSeconds: 30
services: services:
- name: notification-service - name: notification-service

View File

@ -13,7 +13,7 @@ notification:
registry: docker.io registry: docker.io
repository: null repository: null
name: notification name: notification
tag: 1.9.0 tag: snapshot-ee519ca
imagePullPolicy: IfNotPresent imagePullPolicy: IfNotPresent
ports: ports:
- name: http - name: http
@ -30,23 +30,23 @@ notification:
readiness: readiness:
type: httpGet type: httpGet
config: config:
path: /api/_/readyz path: /api/_/livez
port: 8003 port: 8003
initialDelaySeconds: 30 initialDelaySeconds: 60
periodSeconds: 30 periodSeconds: 30
timeoutSeconds: 3 timeoutSeconds: 3
successThreshold: 1 successThreshold: 1
failureThreshold: 3 failureThreshold: 5
liveness: liveness:
type: httpGet type: httpGet
config: config:
path: /api/_/livez path: /api/_/livez
port: 8003 port: 8003
initialDelaySeconds: 30 initialDelaySeconds: 60
periodSeconds: 15 periodSeconds: 30
timeoutSeconds: 3 timeoutSeconds: 3
successThreshold: 1 successThreshold: 1
failureThreshold: 3 failureThreshold: 5
terminationGracePeriodSeconds: 30 terminationGracePeriodSeconds: 30
services: services:
- name: notification-service - name: notification-service

View File

@ -23,7 +23,7 @@ executeFreeleapsPipeline {
sastEnabled: false, sastEnabled: false,
imageRegistry: 'docker.io', imageRegistry: 'docker.io',
imageRepository: 'freeleaps', imageRepository: 'freeleaps',
imageName: 'devops', imageName: 'metrics',
imageBuilder: 'dind', imageBuilder: 'dind',
dockerfilePath: 'Dockerfile', dockerfilePath: 'Dockerfile',
imageBuildRoot: '.', imageBuildRoot: '.',