feat: alert manager set-up for all services
This commit is contained in:
parent
6f0b37b30a
commit
9ece101d8c
@ -0,0 +1,37 @@
|
|||||||
|
{{- /*
|
||||||
|
Copyright Broadcom, Inc. All Rights Reserved.
|
||||||
|
SPDX-License-Identifier: APACHE-2.0
|
||||||
|
*/}}
|
||||||
|
|
||||||
|
{{- /*
Renders a PrometheusRule for the central-storage service from
.Values.centralStorage.prometheusRule. Nothing is emitted unless `enabled`
is truthy. `name` is used for both the resource name and the single rule
group; every entry of `rules` becomes one alerting rule with `alert`, `expr`
and optional `for`, `labels`, `annotations`.

The values key is the camelCase `centralStorage`: a hyphenated path such as
`.Values.central-storage` is not a valid Go-template field chain and makes
the chart fail to parse.
*/}}
{{- $ruleCfg := .Values.centralStorage.prometheusRule }}
{{- if $ruleCfg.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ $ruleCfg.name }}
  namespace: {{ $ruleCfg.namespace | quote }}
  {{- with $ruleCfg.labels }}
  labels:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  groups:
    {{- with $ruleCfg.rules }}
    - name: {{ $ruleCfg.name }}
      rules:
        {{- range . }}
        - alert: {{ .alert }}
          expr: {{ .expr | quote }}
          {{- if .for }}
          for: {{ .for }}
          {{- end }}
          {{- if .labels }}
          labels:
            {{- toYaml .labels | nindent 12 }}
          {{- end }}
          {{- if .annotations }}
          annotations:
            {{- /* toYaml copies annotation text verbatim, so any {{ ... }} placeholders in the values survive rendering. */}}
            {{- toYaml .annotations | nindent 12 }}
          {{- end }}
        {{- end }}
    {{- end }}
{{- end }}
|
||||||
@ -119,3 +119,30 @@ centralStorage:
|
|||||||
controlledResources:
|
controlledResources:
|
||||||
- cpu
|
- cpu
|
||||||
- memory
|
- memory
|
||||||
|
# Alerting rules for the central-storage service. The chart's PrometheusRule
# template reads this mapping and emits nothing unless `enabled` is true.
prometheusRule:
  # Reused by the template as both the resource name and the rule-group name.
  name: freeleaps-alpha-central-storage
  enabled: false
  namespace: freeleaps-monitoring-system
  labels:
    # NOTE(review): presumably the label the Prometheus rule selector matches on - confirm.
    release: kube-prometheus-stack
  rules:
    # Fires once the scrape target has reported up == 0 for one minute.
    - alert: FreeleapsCentralStorageServiceDown
      expr: 'up{job="central-storage-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: central-storage-service
      annotations:
        # The {{ ... }} placeholders are copied through verbatim by the template (toYaml).
        summary: 'Freeleaps Central Storage service is down (instance {{ $labels.instance }})'
        description: Freeleaps Central Storage service has been down for more than 1 minute.
        runbook_url: 'https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7'
    # Fires once the per-second rate of 5xx responses has stayed above 0.1 for five minutes.
    - alert: FreeleapsCentralStorageServiceHighErrorRate
      expr: 'rate(http_requests_total{job="central-storage-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: central-storage-service
      annotations:
        summary: 'High error rate in freeleaps central storage service (instance {{ $labels.instance }})'
        description: 'Freeleaps Central Storage service error rate is {{ $value }} errors per second.'
        runbook_url: 'https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7'
|
||||||
|
|||||||
37
freeleaps/helm-pkg/chat/templates/chat/prometheusrule.yaml
Normal file
37
freeleaps/helm-pkg/chat/templates/chat/prometheusrule.yaml
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
{{- /*
|
||||||
|
Copyright Broadcom, Inc. All Rights Reserved.
|
||||||
|
SPDX-License-Identifier: APACHE-2.0
|
||||||
|
*/}}
|
||||||
|
|
||||||
|
{{- /*
Renders a PrometheusRule for the chat service from .Values.chat.prometheusRule.
Nothing is emitted unless `enabled` is truthy. `name` is used for both the
resource name and the single rule group; every entry of `rules` becomes one
alerting rule with `alert`, `expr` and optional `for`, `labels`, `annotations`.
*/}}
{{- $ruleCfg := .Values.chat.prometheusRule }}
{{- if $ruleCfg.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ $ruleCfg.name }}
  namespace: {{ $ruleCfg.namespace | quote }}
  {{- if $ruleCfg.labels }}
  labels:
    {{- toYaml $ruleCfg.labels | nindent 4 }}
  {{- end }}
spec:
  groups:
    {{- if $ruleCfg.rules }}
    - name: {{ $ruleCfg.name }}
      rules:
        {{- range $rule := $ruleCfg.rules }}
        - alert: {{ $rule.alert }}
          expr: {{ $rule.expr | quote }}
          {{- with $rule.for }}
          for: {{ . }}
          {{- end }}
          {{- with $rule.labels }}
          labels:
            {{- toYaml . | nindent 12 }}
          {{- end }}
          {{- with $rule.annotations }}
          annotations:
            {{- toYaml . | nindent 12 }}
          {{- end }}
        {{- end }}
    {{- end }}
{{- end }}
|
||||||
@ -154,3 +154,30 @@ chat:
|
|||||||
controlledResources:
|
controlledResources:
|
||||||
- cpu
|
- cpu
|
||||||
- memory
|
- memory
|
||||||
|
# Alerting rules for the chat service. The chart's PrometheusRule template
# reads this mapping and emits nothing unless `enabled` is true.
prometheusRule:
  # Reused by the template as both the resource name and the rule-group name.
  name: freeleaps-alpha-chat
  enabled: false
  namespace: freeleaps-monitoring-system
  labels:
    # NOTE(review): presumably the label the Prometheus rule selector matches on - confirm.
    release: kube-prometheus-stack
  rules:
    # Fires once the scrape target has reported up == 0 for one minute.
    - alert: FreeleapsChatServiceDown
      expr: 'up{job="chat-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: chat-service
      annotations:
        # The {{ ... }} placeholders are copied through verbatim by the template (toYaml).
        summary: 'Freeleaps Chat service is down (instance {{ $labels.instance }})'
        description: Freeleaps Chat service has been down for more than 1 minute.
        runbook_url: 'https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7'
    # Fires once the per-second rate of 5xx responses has stayed above 0.1 for five minutes.
    - alert: FreeleapsChatServiceHighErrorRate
      expr: 'rate(http_requests_total{job="chat-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: chat-service
      annotations:
        summary: 'High error rate in freeleaps chat service (instance {{ $labels.instance }})'
        description: 'Freeleaps Chat service error rate is {{ $value }} errors per second.'
        runbook_url: 'https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7'
|
||||||
|
|||||||
@ -145,3 +145,31 @@ chat:
|
|||||||
controlledResources:
|
controlledResources:
|
||||||
- cpu
|
- cpu
|
||||||
- memory
|
- memory
|
||||||
|
# Alerting rules for the chat service. The chart's PrometheusRule template
# reads this mapping and emits nothing unless `enabled` is true.
prometheusRule:
  # Reused by the template as both the resource name and the rule-group name.
  name: freeleaps-prod-chat
  enabled: true
  namespace: freeleaps-monitoring-system
  labels:
    # NOTE(review): presumably the label the Prometheus rule selector matches on - confirm.
    release: kube-prometheus-stack
  rules:
    # Fires once the scrape target has reported up == 0 for one minute.
    - alert: FreeleapsChatServiceDown
      expr: 'up{job="chat-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: chat-service
      annotations:
        # The {{ ... }} placeholders are copied through verbatim by the template (toYaml).
        summary: 'Freeleaps Chat service is down (instance {{ $labels.instance }})'
        description: Freeleaps Chat service has been down for more than 1 minute.
        runbook_url: 'https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7'
    # Fires once the per-second rate of 5xx responses has stayed above 0.1 for five minutes.
    - alert: FreeleapsChatServiceHighErrorRate
      expr: 'rate(http_requests_total{job="chat-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: chat-service
      annotations:
        summary: 'High error rate in freeleaps chat service (instance {{ $labels.instance }})'
        description: 'Freeleaps Chat service error rate is {{ $value }} errors per second.'
        runbook_url: 'https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7'
|
||||||
|
|
||||||
|
|||||||
@ -0,0 +1,37 @@
|
|||||||
|
{{- /*
|
||||||
|
Copyright Broadcom, Inc. All Rights Reserved.
|
||||||
|
SPDX-License-Identifier: APACHE-2.0
|
||||||
|
*/}}
|
||||||
|
|
||||||
|
{{- /*
Renders a PrometheusRule for the content service from
.Values.content.prometheusRule. Nothing is emitted unless `enabled` is truthy.
`name` is used for both the resource name and the single rule group; every
entry of `rules` becomes one alerting rule with `alert`, `expr` and optional
`for`, `labels`, `annotations`.
*/}}
{{- $ruleCfg := .Values.content.prometheusRule }}
{{- if $ruleCfg.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ $ruleCfg.name }}
  namespace: {{ $ruleCfg.namespace | quote }}
  {{- if $ruleCfg.labels }}
  labels:
    {{- toYaml $ruleCfg.labels | nindent 4 }}
  {{- end }}
spec:
  groups:
    {{- if $ruleCfg.rules }}
    - name: {{ $ruleCfg.name }}
      rules:
        {{- range $rule := $ruleCfg.rules }}
        - alert: {{ $rule.alert }}
          expr: {{ $rule.expr | quote }}
          {{- with $rule.for }}
          for: {{ . }}
          {{- end }}
          {{- with $rule.labels }}
          labels:
            {{- toYaml . | nindent 12 }}
          {{- end }}
          {{- with $rule.annotations }}
          annotations:
            {{- toYaml . | nindent 12 }}
          {{- end }}
        {{- end }}
    {{- end }}
{{- end }}
|
||||||
@ -115,3 +115,30 @@ content:
|
|||||||
controlledResources:
|
controlledResources:
|
||||||
- cpu
|
- cpu
|
||||||
- memory
|
- memory
|
||||||
|
# Alerting rules for the content service. The chart's PrometheusRule template
# reads this mapping and emits nothing unless `enabled` is true.
prometheusRule:
  # Reused by the template as both the resource name and the rule-group name.
  name: freeleaps-alpha-content
  enabled: false
  namespace: freeleaps-monitoring-system
  labels:
    # NOTE(review): presumably the label the Prometheus rule selector matches on - confirm.
    release: kube-prometheus-stack
  rules:
    # Fires once the scrape target has reported up == 0 for one minute.
    - alert: FreeleapsContentServiceDown
      expr: 'up{job="content-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: content-service
      annotations:
        # The {{ ... }} placeholders are copied through verbatim by the template (toYaml).
        summary: 'Freeleaps Content service is down (instance {{ $labels.instance }})'
        description: Freeleaps Content service has been down for more than 1 minute.
        runbook_url: 'https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7'
    # Fires once the per-second rate of 5xx responses has stayed above 0.1 for five minutes.
    - alert: FreeleapsContentServiceHighErrorRate
      expr: 'rate(http_requests_total{job="content-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: content-service
      annotations:
        summary: 'High error rate in freeleaps content service (instance {{ $labels.instance }})'
        description: 'Freeleaps Content service error rate is {{ $value }} errors per second.'
        runbook_url: 'https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7'
|
||||||
|
|||||||
@ -106,3 +106,30 @@ content:
|
|||||||
controlledResources:
|
controlledResources:
|
||||||
- cpu
|
- cpu
|
||||||
- memory
|
- memory
|
||||||
|
# Alerting rules for the content service. The chart's PrometheusRule template
# reads this mapping and emits nothing unless `enabled` is true.
prometheusRule:
  # Reused by the template as both the resource name and the rule-group name.
  name: freeleaps-prod-content
  enabled: true
  namespace: freeleaps-monitoring-system
  labels:
    # NOTE(review): presumably the label the Prometheus rule selector matches on - confirm.
    release: kube-prometheus-stack
  rules:
    # Fires once the scrape target has reported up == 0 for one minute.
    - alert: FreeleapsContentServiceDown
      expr: 'up{job="content-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: content-service
      annotations:
        # The {{ ... }} placeholders are copied through verbatim by the template (toYaml).
        summary: 'Freeleaps Content service is down (instance {{ $labels.instance }})'
        description: Freeleaps Content service has been down for more than 1 minute.
        runbook_url: 'https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7'
    # Fires once the per-second rate of 5xx responses has stayed above 0.1 for five minutes.
    - alert: FreeleapsContentServiceHighErrorRate
      expr: 'rate(http_requests_total{job="content-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: content-service
      annotations:
        summary: 'High error rate in freeleaps content service (instance {{ $labels.instance }})'
        description: 'Freeleaps Content service error rate is {{ $value }} errors per second.'
        runbook_url: 'https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7'
|
||||||
|
|||||||
@ -0,0 +1,37 @@
|
|||||||
|
{{- /*
|
||||||
|
Copyright Broadcom, Inc. All Rights Reserved.
|
||||||
|
SPDX-License-Identifier: APACHE-2.0
|
||||||
|
*/}}
|
||||||
|
|
||||||
|
{{- /*
Renders a PrometheusRule for the devops service from
.Values.devops.prometheusRule. Nothing is emitted unless `enabled` is truthy.
`name` is used for both the resource name and the single rule group; every
entry of `rules` becomes one alerting rule with `alert`, `expr` and optional
`for`, `labels`, `annotations`.
*/}}
{{- $ruleCfg := .Values.devops.prometheusRule }}
{{- if $ruleCfg.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ $ruleCfg.name }}
  namespace: {{ $ruleCfg.namespace | quote }}
  {{- if $ruleCfg.labels }}
  labels:
    {{- toYaml $ruleCfg.labels | nindent 4 }}
  {{- end }}
spec:
  groups:
    {{- if $ruleCfg.rules }}
    - name: {{ $ruleCfg.name }}
      rules:
        {{- range $rule := $ruleCfg.rules }}
        - alert: {{ $rule.alert }}
          expr: {{ $rule.expr | quote }}
          {{- with $rule.for }}
          for: {{ . }}
          {{- end }}
          {{- with $rule.labels }}
          labels:
            {{- toYaml . | nindent 12 }}
          {{- end }}
          {{- with $rule.annotations }}
          annotations:
            {{- toYaml . | nindent 12 }}
          {{- end }}
        {{- end }}
    {{- end }}
{{- end }}
|
||||||
@ -120,3 +120,30 @@ devops:
|
|||||||
controlledResources:
|
controlledResources:
|
||||||
- cpu
|
- cpu
|
||||||
- memory
|
- memory
|
||||||
|
# Alerting rules for the devops service. The chart's PrometheusRule template
# reads this mapping and emits nothing unless `enabled` is true.
prometheusRule:
  # Reused by the template as both the resource name and the rule-group name.
  name: freeleaps-alpha-devops
  enabled: false
  namespace: freeleaps-monitoring-system
  labels:
    # NOTE(review): presumably the label the Prometheus rule selector matches on - confirm.
    release: kube-prometheus-stack
  rules:
    # Fires once the scrape target has reported up == 0 for one minute.
    - alert: FreeleapsDevopsServiceDown
      expr: 'up{job="devops-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: devops-service
      annotations:
        # The {{ ... }} placeholders are copied through verbatim by the template (toYaml).
        summary: 'Freeleaps Devops service is down (instance {{ $labels.instance }})'
        description: Freeleaps Devops service has been down for more than 1 minute.
        runbook_url: 'https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7'
    # Fires once the per-second rate of 5xx responses has stayed above 0.1 for five minutes.
    - alert: FreeleapsDevopsServiceHighErrorRate
      expr: 'rate(http_requests_total{job="devops-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: devops-service
      annotations:
        summary: 'High error rate in freeleaps devops service (instance {{ $labels.instance }})'
        description: 'Freeleaps Devops service error rate is {{ $value }} errors per second.'
        runbook_url: 'https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7'
|
||||||
|
|||||||
@ -97,3 +97,30 @@ devops:
|
|||||||
controlledResources:
|
controlledResources:
|
||||||
- cpu
|
- cpu
|
||||||
- memory
|
- memory
|
||||||
|
# Alerting rules for the devops service. The chart's PrometheusRule template
# reads this mapping and emits nothing unless `enabled` is true.
prometheusRule:
  # Reused by the template as both the resource name and the rule-group name.
  name: freeleaps-prod-devops
  enabled: true
  namespace: freeleaps-monitoring-system
  labels:
    # NOTE(review): presumably the label the Prometheus rule selector matches on - confirm.
    release: kube-prometheus-stack
  rules:
    # Fires once the scrape target has reported up == 0 for one minute.
    - alert: FreeleapsDevopsServiceDown
      expr: 'up{job="devops-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: devops-service
      annotations:
        # The {{ ... }} placeholders are copied through verbatim by the template (toYaml).
        summary: 'Freeleaps Devops service is down (instance {{ $labels.instance }})'
        description: Freeleaps Devops service has been down for more than 1 minute.
        runbook_url: 'https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7'
    # Fires once the per-second rate of 5xx responses has stayed above 0.1 for five minutes.
    - alert: FreeleapsDevopsServiceHighErrorRate
      expr: 'rate(http_requests_total{job="devops-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: devops-service
      annotations:
        summary: 'High error rate in freeleaps devops service (instance {{ $labels.instance }})'
        description: 'Freeleaps Devops service error rate is {{ $value }} errors per second.'
        runbook_url: 'https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7'
|
||||||
|
|||||||
@ -0,0 +1,37 @@
|
|||||||
|
{{- /*
|
||||||
|
Copyright Broadcom, Inc. All Rights Reserved.
|
||||||
|
SPDX-License-Identifier: APACHE-2.0
|
||||||
|
*/}}
|
||||||
|
|
||||||
|
{{- /*
Renders a PrometheusRule for the devsvc service from
.Values.devsvc.prometheusRule. Nothing is emitted unless `enabled` is truthy.
`name` is used for both the resource name and the single rule group; every
entry of `rules` becomes one alerting rule with `alert`, `expr` and optional
`for`, `labels`, `annotations`.
*/}}
{{- $ruleCfg := .Values.devsvc.prometheusRule }}
{{- if $ruleCfg.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ $ruleCfg.name }}
  namespace: {{ $ruleCfg.namespace | quote }}
  {{- if $ruleCfg.labels }}
  labels:
    {{- toYaml $ruleCfg.labels | nindent 4 }}
  {{- end }}
spec:
  groups:
    {{- if $ruleCfg.rules }}
    - name: {{ $ruleCfg.name }}
      rules:
        {{- range $rule := $ruleCfg.rules }}
        - alert: {{ $rule.alert }}
          expr: {{ $rule.expr | quote }}
          {{- with $rule.for }}
          for: {{ . }}
          {{- end }}
          {{- with $rule.labels }}
          labels:
            {{- toYaml . | nindent 12 }}
          {{- end }}
          {{- with $rule.annotations }}
          annotations:
            {{- toYaml . | nindent 12 }}
          {{- end }}
        {{- end }}
    {{- end }}
{{- end }}
|
||||||
@ -147,3 +147,30 @@ devsvc:
|
|||||||
controlledResources:
|
controlledResources:
|
||||||
- cpu
|
- cpu
|
||||||
- memory
|
- memory
|
||||||
|
# Alerting rules for the devsvc service. The chart's PrometheusRule template
# reads this mapping and emits nothing unless `enabled` is true.
prometheusRule:
  # Reused by the template as both the resource name and the rule-group name.
  name: freeleaps-alpha-devsvc
  enabled: false
  namespace: freeleaps-monitoring-system
  labels:
    # NOTE(review): presumably the label the Prometheus rule selector matches on - confirm.
    release: kube-prometheus-stack
  rules:
    # Fires once the scrape target has reported up == 0 for one minute.
    - alert: FreeleapsDevsvcServiceDown
      expr: 'up{job="devsvc-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: devsvc-service
      annotations:
        # The {{ ... }} placeholders are copied through verbatim by the template (toYaml).
        summary: 'Freeleaps Devsvc service is down (instance {{ $labels.instance }})'
        description: Freeleaps Devsvc service has been down for more than 1 minute.
        runbook_url: 'https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7'
    # Fires once the per-second rate of 5xx responses has stayed above 0.1 for five minutes.
    - alert: FreeleapsDevsvcServiceHighErrorRate
      expr: 'rate(http_requests_total{job="devsvc-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: devsvc-service
      annotations:
        summary: 'High error rate in freeleaps devsvc service (instance {{ $labels.instance }})'
        description: 'Freeleaps Devsvc service error rate is {{ $value }} errors per second.'
        runbook_url: 'https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7'
|
||||||
|
|||||||
@ -138,3 +138,30 @@ devsvc:
|
|||||||
controlledResources:
|
controlledResources:
|
||||||
- cpu
|
- cpu
|
||||||
- memory
|
- memory
|
||||||
|
# Alerting rules for the devsvc service. The chart's PrometheusRule template
# reads this mapping and emits nothing unless `enabled` is true.
prometheusRule:
  # Reused by the template as both the resource name and the rule-group name.
  name: freeleaps-prod-devsvc
  enabled: true
  namespace: freeleaps-monitoring-system
  labels:
    # NOTE(review): presumably the label the Prometheus rule selector matches on - confirm.
    release: kube-prometheus-stack
  rules:
    # Fires once the scrape target has reported up == 0 for one minute.
    - alert: FreeleapsDevsvcServiceDown
      expr: 'up{job="devsvc-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: devsvc-service
      annotations:
        # The {{ ... }} placeholders are copied through verbatim by the template (toYaml).
        summary: 'Freeleaps Devsvc service is down (instance {{ $labels.instance }})'
        description: Freeleaps Devsvc service has been down for more than 1 minute.
        runbook_url: 'https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7'
    # Fires once the per-second rate of 5xx responses has stayed above 0.1 for five minutes.
    - alert: FreeleapsDevsvcServiceHighErrorRate
      expr: 'rate(http_requests_total{job="devsvc-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: devsvc-service
      annotations:
        summary: 'High error rate in freeleaps devsvc service (instance {{ $labels.instance }})'
        description: 'Freeleaps Devsvc service error rate is {{ $value }} errors per second.'
        runbook_url: 'https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7'
|
||||||
|
|||||||
@ -0,0 +1,37 @@
|
|||||||
|
{{- /*
|
||||||
|
Copyright Broadcom, Inc. All Rights Reserved.
|
||||||
|
SPDX-License-Identifier: APACHE-2.0
|
||||||
|
*/}}
|
||||||
|
|
||||||
|
{{- /*
Renders a PrometheusRule for the freeleaps service from
.Values.freeleaps.prometheusRule. Nothing is emitted unless `enabled` is
truthy. `name` is used for both the resource name and the single rule group;
every entry of `rules` becomes one alerting rule with `alert`, `expr` and
optional `for`, `labels`, `annotations`.
*/}}
{{- $ruleCfg := .Values.freeleaps.prometheusRule }}
{{- if $ruleCfg.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ $ruleCfg.name }}
  namespace: {{ $ruleCfg.namespace | quote }}
  {{- if $ruleCfg.labels }}
  labels:
    {{- toYaml $ruleCfg.labels | nindent 4 }}
  {{- end }}
spec:
  groups:
    {{- if $ruleCfg.rules }}
    - name: {{ $ruleCfg.name }}
      rules:
        {{- range $rule := $ruleCfg.rules }}
        - alert: {{ $rule.alert }}
          expr: {{ $rule.expr | quote }}
          {{- with $rule.for }}
          for: {{ . }}
          {{- end }}
          {{- with $rule.labels }}
          labels:
            {{- toYaml . | nindent 12 }}
          {{- end }}
          {{- with $rule.annotations }}
          annotations:
            {{- toYaml . | nindent 12 }}
          {{- end }}
        {{- end }}
    {{- end }}
{{- end }}
|
||||||
@ -141,3 +141,30 @@ freeleaps:
|
|||||||
controlledResources:
|
controlledResources:
|
||||||
- cpu
|
- cpu
|
||||||
- memory
|
- memory
|
||||||
|
# Alerting rules for the freeleaps service. The chart's PrometheusRule template
# reads this mapping and emits nothing unless `enabled` is true.
prometheusRule:
  # Reused by the template as both the resource name and the rule-group name.
  name: freeleaps-alpha-freeleaps
  enabled: false
  namespace: freeleaps-monitoring-system
  labels:
    # NOTE(review): presumably the label the Prometheus rule selector matches on - confirm.
    release: kube-prometheus-stack
  rules:
    # Fires once the scrape target has reported up == 0 for one minute.
    - alert: FreeleapsFreeleapsServiceDown
      expr: 'up{job="freeleaps-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: freeleaps-service
      annotations:
        # The {{ ... }} placeholders are copied through verbatim by the template (toYaml).
        summary: 'Freeleaps Freeleaps service is down (instance {{ $labels.instance }})'
        description: Freeleaps Freeleaps service has been down for more than 1 minute.
        runbook_url: 'https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7'
    # Fires once the per-second rate of 5xx responses has stayed above 0.1 for five minutes.
    # The selector must target this service's own job (freeleaps-service), not another service's.
    - alert: FreeleapsFreeleapsServiceHighErrorRate
      expr: 'rate(http_requests_total{job="freeleaps-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: freeleaps-service
      annotations:
        summary: 'High error rate in freeleaps freeleaps service (instance {{ $labels.instance }})'
        description: 'Freeleaps Freeleaps service error rate is {{ $value }} errors per second.'
        runbook_url: 'https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7'
|
||||||
|
|||||||
@ -132,3 +132,30 @@ freeleaps:
|
|||||||
controlledResources:
|
controlledResources:
|
||||||
- cpu
|
- cpu
|
||||||
- memory
|
- memory
|
||||||
|
# Alerting rules for the freeleaps service. The chart's PrometheusRule template
# reads this mapping and emits nothing unless `enabled` is true.
prometheusRule:
  # Reused by the template as both the resource name and the rule-group name.
  name: freeleaps-prod-freeleaps
  enabled: true
  namespace: freeleaps-monitoring-system
  labels:
    # NOTE(review): presumably the label the Prometheus rule selector matches on - confirm.
    release: kube-prometheus-stack
  rules:
    # Fires once the scrape target has reported up == 0 for one minute.
    - alert: FreeleapsFreeleapsServiceDown
      expr: 'up{job="freeleaps-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: freeleaps-service
      annotations:
        # The {{ ... }} placeholders are copied through verbatim by the template (toYaml).
        summary: 'Freeleaps Freeleaps service is down (instance {{ $labels.instance }})'
        description: Freeleaps Freeleaps service has been down for more than 1 minute.
        runbook_url: 'https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7'
    # Fires once the per-second rate of 5xx responses has stayed above 0.1 for five minutes.
    - alert: FreeleapsFreeleapsServiceHighErrorRate
      expr: 'rate(http_requests_total{job="freeleaps-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: freeleaps-service
      annotations:
        summary: 'High error rate in freeleaps freeleaps service (instance {{ $labels.instance }})'
        description: 'Freeleaps Freeleaps service error rate is {{ $value }} errors per second.'
        runbook_url: 'https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7'
|
||||||
|
|||||||
@ -0,0 +1,37 @@
|
|||||||
|
{{- /*
|
||||||
|
Copyright Broadcom, Inc. All Rights Reserved.
|
||||||
|
SPDX-License-Identifier: APACHE-2.0
|
||||||
|
*/}}
|
||||||
|
|
||||||
|
{{- /*
Renders a PrometheusRule for the notification service from
.Values.notification.prometheusRule. Nothing is emitted unless `enabled` is
truthy. `name` is used for both the resource name and the single rule group;
every entry of `rules` becomes one alerting rule with `alert`, `expr` and
optional `for`, `labels`, `annotations`.
*/}}
{{- $ruleCfg := .Values.notification.prometheusRule }}
{{- if $ruleCfg.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ $ruleCfg.name }}
  namespace: {{ $ruleCfg.namespace | quote }}
  {{- if $ruleCfg.labels }}
  labels:
    {{- toYaml $ruleCfg.labels | nindent 4 }}
  {{- end }}
spec:
  groups:
    {{- if $ruleCfg.rules }}
    - name: {{ $ruleCfg.name }}
      rules:
        {{- range $rule := $ruleCfg.rules }}
        - alert: {{ $rule.alert }}
          expr: {{ $rule.expr | quote }}
          {{- with $rule.for }}
          for: {{ . }}
          {{- end }}
          {{- with $rule.labels }}
          labels:
            {{- toYaml . | nindent 12 }}
          {{- end }}
          {{- with $rule.annotations }}
          annotations:
            {{- toYaml . | nindent 12 }}
          {{- end }}
        {{- end }}
    {{- end }}
{{- end }}
|
||||||
@ -143,3 +143,30 @@ notification:
|
|||||||
remoteRef:
|
remoteRef:
|
||||||
key: freeleaps-alpha-twilio-auth-token
|
key: freeleaps-alpha-twilio-auth-token
|
||||||
type: Secret
|
type: Secret
|
||||||
|
# Alerting rules for the notification service. The chart's PrometheusRule
# template reads this mapping and emits nothing unless `enabled` is true.
prometheusRule:
  # Reused by the template as both the resource name and the rule-group name.
  name: freeleaps-alpha-notification
  enabled: false
  namespace: freeleaps-monitoring-system
  labels:
    # NOTE(review): presumably the label the Prometheus rule selector matches on - confirm.
    release: kube-prometheus-stack
  rules:
    # Fires once the scrape target has reported up == 0 for one minute.
    - alert: FreeleapsNotificationServiceDown
      expr: 'up{job="notification-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: notification-service
      annotations:
        # The {{ ... }} placeholders are copied through verbatim by the template (toYaml).
        summary: 'Freeleaps Notification service is down (instance {{ $labels.instance }})'
        description: Freeleaps Notification service has been down for more than 1 minute.
        runbook_url: 'https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7'
    # Fires once the per-second rate of 5xx responses has stayed above 0.1 for five minutes.
    - alert: FreeleapsNotificationServiceHighErrorRate
      expr: 'rate(http_requests_total{job="notification-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: notification-service
      annotations:
        summary: 'High error rate in freeleaps notification service (instance {{ $labels.instance }})'
        description: 'Freeleaps Notification service error rate is {{ $value }} errors per second.'
        runbook_url: 'https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7'
|
||||||
|
|||||||
@ -129,3 +129,30 @@ notification:
|
|||||||
controlledResources:
|
controlledResources:
|
||||||
- cpu
|
- cpu
|
||||||
- memory
|
- memory
|
||||||
|
# Alerting rules for the notification service. The chart's PrometheusRule
# template reads this mapping and emits nothing unless `enabled` is true.
prometheusRule:
  # Reused by the template as both the resource name and the rule-group name.
  name: freeleaps-prod-notification
  enabled: true
  namespace: freeleaps-monitoring-system
  labels:
    # NOTE(review): presumably the label the Prometheus rule selector matches on - confirm.
    release: kube-prometheus-stack
  rules:
    # Fires once the scrape target has reported up == 0 for one minute.
    - alert: FreeleapsNotificationServiceDown
      expr: 'up{job="notification-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: notification-service
      annotations:
        # The {{ ... }} placeholders are copied through verbatim by the template (toYaml).
        summary: 'Freeleaps Notification service is down (instance {{ $labels.instance }})'
        description: Freeleaps Notification service has been down for more than 1 minute.
        runbook_url: 'https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7'
    # Fires once the per-second rate of 5xx responses has stayed above 0.1 for five minutes.
    - alert: FreeleapsNotificationServiceHighErrorRate
      expr: 'rate(http_requests_total{job="notification-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: notification-service
      annotations:
        summary: 'High error rate in freeleaps notification service (instance {{ $labels.instance }})'
        description: 'Freeleaps Notification service error rate is {{ $value }} errors per second.'
        runbook_url: 'https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7'
|
||||||
|
|||||||
@ -0,0 +1,37 @@
|
|||||||
|
{{- /*
|
||||||
|
Copyright Broadcom, Inc. All Rights Reserved.
|
||||||
|
SPDX-License-Identifier: APACHE-2.0
|
||||||
|
*/}}
|
||||||
|
|
||||||
|
{{- /*
Renders a PrometheusRule for the payment service from
.Values.payment.prometheusRule. Nothing is emitted unless `enabled` is truthy.
`name` is used for both the resource name and the single rule group; every
entry of `rules` becomes one alerting rule with `alert`, `expr` and optional
`for`, `labels`, `annotations`.
*/}}
{{- $ruleCfg := .Values.payment.prometheusRule }}
{{- if $ruleCfg.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ $ruleCfg.name }}
  namespace: {{ $ruleCfg.namespace | quote }}
  {{- if $ruleCfg.labels }}
  labels:
    {{- toYaml $ruleCfg.labels | nindent 4 }}
  {{- end }}
spec:
  groups:
    {{- if $ruleCfg.rules }}
    - name: {{ $ruleCfg.name }}
      rules:
        {{- range $rule := $ruleCfg.rules }}
        - alert: {{ $rule.alert }}
          expr: {{ $rule.expr | quote }}
          {{- with $rule.for }}
          for: {{ . }}
          {{- end }}
          {{- with $rule.labels }}
          labels:
            {{- toYaml . | nindent 12 }}
          {{- end }}
          {{- with $rule.annotations }}
          annotations:
            {{- toYaml . | nindent 12 }}
          {{- end }}
        {{- end }}
    {{- end }}
{{- end }}
|
||||||
@ -115,3 +115,30 @@ payment:
|
|||||||
controlledResources:
|
controlledResources:
|
||||||
- cpu
|
- cpu
|
||||||
- memory
|
- memory
|
||||||
|
# Alerting rules for the payment service. The chart's PrometheusRule template
# reads this mapping and emits nothing unless `enabled` is true.
prometheusRule:
  # Reused by the template as both the resource name and the rule-group name.
  name: freeleaps-alpha-payment
  enabled: false
  namespace: freeleaps-monitoring-system
  labels:
    # NOTE(review): presumably the label the Prometheus rule selector matches on - confirm.
    release: kube-prometheus-stack
  rules:
    # Fires once the scrape target has reported up == 0 for one minute.
    - alert: FreeleapsPaymentServiceDown
      expr: 'up{job="payment-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: payment-service
      annotations:
        # The {{ ... }} placeholders are copied through verbatim by the template (toYaml).
        summary: 'Freeleaps Payment service is down (instance {{ $labels.instance }})'
        description: Freeleaps Payment service has been down for more than 1 minute.
        runbook_url: 'https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7'
    # Fires once the per-second rate of 5xx responses has stayed above 0.1 for five minutes.
    - alert: FreeleapsPaymentServiceHighErrorRate
      expr: 'rate(http_requests_total{job="payment-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: payment-service
      annotations:
        summary: 'High error rate in freeleaps payment service (instance {{ $labels.instance }})'
        description: 'Freeleaps Payment service error rate is {{ $value }} errors per second.'
        runbook_url: 'https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7'
|
||||||
|
|||||||
@ -106,3 +106,30 @@ payment:
|
|||||||
controlledResources:
|
controlledResources:
|
||||||
- cpu
|
- cpu
|
||||||
- memory
|
- memory
|
||||||
|
# Alerting rules for the payment service. The chart's PrometheusRule template
# reads this mapping and emits nothing unless `enabled` is true.
prometheusRule:
  # Reused by the template as both the resource name and the rule-group name.
  name: freeleaps-prod-payment
  enabled: true
  namespace: freeleaps-monitoring-system
  labels:
    # NOTE(review): presumably the label the Prometheus rule selector matches on - confirm.
    release: kube-prometheus-stack
  rules:
    # Fires once the scrape target has reported up == 0 for one minute.
    - alert: FreeleapsPaymentServiceDown
      expr: 'up{job="payment-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: payment-service
      annotations:
        # The {{ ... }} placeholders are copied through verbatim by the template (toYaml).
        summary: 'Freeleaps Payment service is down (instance {{ $labels.instance }})'
        description: Freeleaps Payment service has been down for more than 1 minute.
        runbook_url: 'https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7'
    # Fires once the per-second rate of 5xx responses has stayed above 0.1 for five minutes.
    - alert: FreeleapsPaymentServiceHighErrorRate
      expr: 'rate(http_requests_total{job="payment-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: payment-service
      annotations:
        summary: 'High error rate in freeleaps payment service (instance {{ $labels.instance }})'
        description: 'Freeleaps Payment service error rate is {{ $value }} errors per second.'
        runbook_url: 'https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7'
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user