feat: add prometheusrule for metrics service

This commit is contained in:
icecheng 2025-09-23 14:20:52 +08:00
parent e474d5dff8
commit 8f7beabe4c
6 changed files with 212 additions and 24 deletions

View File

@ -0,0 +1,32 @@
# Prometheus Alert Rule Config
Add `prometheusrule.yaml` to `<helm-pkg>/templates`.
See the following example:
```
{{- /*
Copyright Broadcom, Inc. All Rights Reserved.
SPDX-License-Identifier: APACHE-2.0
*/}}
{{- /*
Example PrometheusRule template.
The resource is rendered only when both .Values.metrics.enabled and
.Values.metrics.prometheusRule.enabled are truthy.
*/}}
{{- if and .Values.metrics.enabled .Values.metrics.prometheusRule.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: {{ include "common.names.fullname" . }}
{{- /* Uses .Values.metrics.prometheusRule.namespace when set, otherwise the output of the "common.names.namespace" helper. */}}
namespace: {{ default (include "common.names.namespace" .) .Values.metrics.prometheusRule.namespace | quote}}
labels: {{- include "common.labels.standard" ( dict "customLabels" .Values.commonLabels "context" $ ) | nindent 4 }}
{{- /* Optional extra labels are appended to the standard labels above. */}}
{{- if .Values.metrics.prometheusRule.additionalLabels }}
{{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 4 }}
{{- end }}
{{- if .Values.commonAnnotations }}
annotations: {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 4 }}
{{- end }}
spec:
groups:
{{- /*
A single rule group, named by the "common.names.name" helper, is emitted only
when .Values.metrics.prometheusRule.rules is non-empty.
NOTE(review): "common.tplvalues.render" presumably evaluates template
expressions found in the values, so Prometheus placeholders written in double
braces (for example $labels.instance) may need escaping - confirm.
*/}}
{{- with .Values.metrics.prometheusRule.rules }}
- name: {{ template "common.names.name" $ }}
rules: {{- include "common.tplvalues.render" (dict "value" . "context" $) | nindent 8 }}
{{- end }}
{{- end }}
```

View File

@ -0,0 +1,37 @@
{{- /*
Copyright Broadcom, Inc. All Rights Reserved.
SPDX-License-Identifier: APACHE-2.0
*/}}
{{- /*
PrometheusRule for the metrics service.

Rendered only when .Values.metrics.prometheusRule.enabled is truthy.
Values read (all under .Values.metrics.prometheusRule):
  name      - resource name; also reused as the name of the single rule group
  namespace - target namespace; falls back to the release namespace when empty
  labels    - optional map copied verbatim onto the resource metadata
  rules     - list of alerting rules; each entry provides alert and expr, plus
              optional for, labels and annotations

Every templated scalar is piped through quote so that values which look like
numbers, booleans or YAML indicators are still emitted as strings.
Rule labels and annotations go through toYaml (not tpl), so Prometheus
placeholders written in double braces reach the manifest untouched.
*/}}
{{- if .Values.metrics.prometheusRule.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ .Values.metrics.prometheusRule.name | quote }}
  namespace: {{ default .Release.Namespace .Values.metrics.prometheusRule.namespace | quote }}
  {{- with .Values.metrics.prometheusRule.labels }}
  labels:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  groups:
    {{- with .Values.metrics.prometheusRule.rules }}
    - name: {{ $.Values.metrics.prometheusRule.name | quote }}
      rules:
        {{- range . }}
        - alert: {{ .alert | quote }}
          expr: {{ .expr | quote }}
          {{- if .for }}
          for: {{ .for | quote }}
          {{- end }}
          {{- if .labels }}
          labels:
            {{- toYaml .labels | nindent 12 }}
          {{- end }}
          {{- if .annotations }}
          annotations:
            {{- toYaml .annotations | nindent 12 }}
          {{- end }}
        {{- end }}
    {{- end }}
{{- end }}

View File

@ -81,3 +81,30 @@ metrics:
controlledResources:
- cpu
- memory
# Alerting rules for the metrics service (alpha environment).
prometheusRule:
  # Name of the PrometheusRule resource.
  name: freeleaps-alpha-metrics
  # Rule creation is switched off for this environment.
  enabled: false
  namespace: "freeleaps-monitoring-system"
  # NOTE(review): assumes Prometheus only loads rules carrying the
  # kube-prometheus-stack release label - confirm against the operator's
  # ruleSelector.
  labels:
    release: kube-prometheus-stack
  rules:
    # Fires when the scrape target has reported down for one minute.
    - alert: FreeleapsMetricsServiceDown
      expr: up{job="metrics-service"} == 0
      for: 1m
      labels:
        severity: critical
        service: metrics-service
      annotations:
        summary: "Freeleaps Metrics service is down (instance {{ $labels.instance }})"
        description: "Freeleaps Metrics service has been down for more than 1 minute."
        runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7"
    # Fires when the 5xx request rate stays above 0.1 per second for five minutes.
    - alert: FreeleapsMetricsServiceHighErrorRate
      expr: rate(http_requests_total{job="metrics-service",status=~"5.."}[5m]) > 0.1
      for: 5m
      labels:
        severity: warning
        service: metrics-service
      annotations:
        summary: "High error rate in freeleaps metrics service (instance {{ $labels.instance }})"
        description: "Freeleaps Metrics service error rate is {{ $value }} errors per second."
        runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7"

View File

@ -81,3 +81,29 @@ metrics:
controlledResources:
- cpu
- memory
# Alerting rules for the metrics service (production environment).
prometheusRule:
  # Name of the PrometheusRule resource.
  name: freeleaps-prod-metrics
  enabled: true
  namespace: "freeleaps-monitoring-system"
  # NOTE(review): assumes Prometheus only loads rules carrying the
  # kube-prometheus-stack release label - confirm against the operator's
  # ruleSelector.
  labels:
    release: kube-prometheus-stack
  rules:
    # Fires when the scrape target has reported down for one minute.
    - alert: FreeleapsMetricsServiceDown
      expr: up{job="metrics-service"} == 0
      for: 1m
      labels:
        severity: critical
        service: metrics-service
      annotations:
        summary: "Freeleaps Metrics service is down (instance {{ $labels.instance }})"
        description: "Freeleaps Metrics service has been down for more than 1 minute."
        runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7"
    # Fires when the 5xx request rate stays above 0.1 per second for five minutes.
    - alert: FreeleapsMetricsServiceHighErrorRate
      expr: rate(http_requests_total{job="metrics-service",status=~"5.."}[5m]) > 0.1
      for: 5m
      labels:
        severity: warning
        service: metrics-service
      annotations:
        summary: "High error rate in freeleaps metrics service (instance {{ $labels.instance }})"
        description: "Freeleaps Metrics service error rate is {{ $value }} errors per second."
        runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7"

View File

@ -55,12 +55,12 @@ metrics:
port: 8009
targetPort: 8009
serviceMonitor:
enabled: false
enabled: true
labels:
release: kube-prometheus-stack
namespace: freeleaps-monitoring-system
internal: 30s
scrapeTimeout: ''
interval: 30s
scrapeTimeout: 10s
configs:
starrocksHost: ""
starrocksPort: 8009
@ -80,3 +80,69 @@ metrics:
controlledResources:
- cpu
- memory
# Alerting rules for the metrics service.
prometheusRule:
  # Name of the PrometheusRule resource.
  name: freeleaps-metrics
  enabled: true
  namespace: "freeleaps-monitoring-system"
  # Same release label the ServiceMonitor in this file carries.
  # NOTE(review): assumes Prometheus selects rules by this label as well -
  # confirm against the operator's ruleSelector.
  labels:
    release: kube-prometheus-stack
  rules:
    # Fires when the scrape target has reported down for one minute.
    - alert: FreeleapsMetricsServiceDown
      expr: up{job="metrics-service"} == 0
      for: 1m
      labels:
        severity: critical
        service: metrics-service
      annotations:
        summary: "Freeleaps Metrics service is down (instance {{ $labels.instance }})"
        description: "Freeleaps Metrics service has been down for more than 1 minute."
        runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7"
    # Fires when the 5xx request rate stays above 0.1 per second for five minutes.
    - alert: FreeleapsMetricsServiceHighErrorRate
      expr: rate(http_requests_total{job="metrics-service",status=~"5.."}[5m]) > 0.1
      for: 5m
      labels:
        severity: warning
        service: metrics-service
      annotations:
        summary: "High error rate in freeleaps metrics service (instance {{ $labels.instance }})"
        description: "Freeleaps Metrics service error rate is {{ $value }} errors per second."
        runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7"
# - alert: MetricsServiceHighLatency
# expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{job="metrics-service"}[5m])) > 1
# for: 5m
# labels:
# severity: warning
# service: metrics-service
# annotations:
# summary: "High latency in metrics service (instance {{ $labels.instance }})"
# description: "95th percentile latency is {{ $value }} seconds."
# - alert: MetricsServiceHighMemoryUsage
# expr: (process_resident_memory_bytes{job="metrics-service"} / 1024 / 1024) > 512
# for: 5m
# labels:
# severity: warning
# service: metrics
# annotations:
# summary: "High memory usage in metrics service (instance {{ $labels.instance }})"
# description: "Memory usage is {{ $value }} MB."
# - alert: MetricsServiceHighCPUUsage
# expr: rate(process_cpu_seconds_total{job="metrics-service"}[5m]) * 100 > 80
# for: 5m
# labels:
# severity: warning
# service: metrics
# annotations:
# summary: "High CPU usage in metrics service (instance {{ $labels.instance }})"
# description: "CPU usage is {{ $value }}%."
# - alert: MetricsServiceNoData
# expr: absent(up{job="metrics-service"})
# for: 5m
# labels:
# severity: critical
# service: metrics
# annotations:
# summary: "No data from metrics service (instance {{ $labels.instance }})"
# description: "No metrics data received from metrics service for more than 5 minutes."

View File

@ -10,6 +10,27 @@ executeFreeleapsPipeline {
executeMode = 'fully'
commitMessageLintEnabled = false
components = [
[
name: 'metrics',
root: 'apps/metrics',
language: 'python',
dependenciesManager: 'pip',
requirementsFile: 'requirements.txt',
buildCacheEnabled: true,
buildAgentImage: 'python:3.12-slim',
buildArtifacts: ['.'],
lintEnabled: false,
sastEnabled: false,
imageRegistry: 'docker.io',
imageRepository: 'freeleaps',
imageName: 'devops',
imageBuilder: 'dind',
dockerfilePath: 'Dockerfile',
imageBuildRoot: '.',
imageReleaseArchitectures: ['linux/amd64', 'linux/arm64/v8'],
registryCredentialsId: 'freeleaps-devops-docker-hub-credentials',
semanticReleaseEnabled: true
],
[
name: 'authentication',
root: 'apps/authentication',
@ -135,27 +156,6 @@ executeFreeleapsPipeline {
imageReleaseArchitectures: ['linux/amd64', 'linux/arm64/v8'],
registryCredentialsId: 'freeleaps-devops-docker-hub-credentials',
semanticReleaseEnabled: true
],
[
name: 'metrics',
root: 'apps/metrcis',
language: 'python',
dependenciesManager: 'pip',
requirementsFile: 'requirements.txt',
buildCacheEnabled: true,
buildAgentImage: 'python:3.12-slim',
buildArtifacts: ['.'],
lintEnabled: false,
sastEnabled: false,
imageRegistry: 'docker.io',
imageRepository: 'freeleaps',
imageName: 'devops',
imageBuilder: 'dind',
dockerfilePath: 'Dockerfile',
imageBuildRoot: '.',
imageReleaseArchitectures: ['linux/amd64', 'linux/arm64/v8'],
registryCredentialsId: 'freeleaps-devops-docker-hub-credentials',
semanticReleaseEnabled: true
]
]
}