diff --git a/freeleaps/helm-pkg/metrics/values.prod.yaml b/freeleaps/helm-pkg/metrics/values.prod.yaml index 618c52a2..6a1ad708 100644 --- a/freeleaps/helm-pkg/metrics/values.prod.yaml +++ b/freeleaps/helm-pkg/metrics/values.prod.yaml @@ -14,8 +14,8 @@ metrics: image: registry: docker.io repository: null - name: metrics - tag: snapshot-38ff0ae + name: devops + tag: snapshot-ee519ca imagePullPolicy: IfNotPresent ports: - name: http @@ -28,7 +28,7 @@ metrics: limits: cpu: '0.2' memory: 128Mi - probes: + probes: liveness: type: httpGet config: @@ -84,26 +84,25 @@ metrics: prometheusRule: name: freepeals-prod-metrics enabled: true - namespace: "freeleaps-monitoring-system" + namespace: freeleaps-monitoring-system rules: - - alert: FreeleapsMetricsServiceDown - expr: up{job="metrics-service"} == 0 - for: 1m - labels: - severity: critical - service: metrics-service - annotations: - summary: "Freeleaps Metrics service is down (instance {{ $labels.instance }})" - description: "Freeleaps Metrics service has been down for more than 1 minutes." - runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7" - - - alert: FreeleapsMetricsServiceHighErrorRate - expr: rate(http_requests_total{job="metrics-service",status=~"5.."}[5m]) > 0.1 - for: 5m - labels: - severity: warning - service: metrics-service - annotations: - summary: "High error rate in freeleaps metrics service (instance {{ $labels.instance }})" - description: "Freeleaps Metrics service error rate is {{ $value }} errors per second." - runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7" \ No newline at end of file + - alert: FreeleapsMetricsServiceDown + expr: up{job="metrics-service"} == 0 + for: 1m + labels: + severity: critical + service: metrics-service + annotations: + summary: Freeleaps Metrics service is down (instance {{ $labels.instance }}) + description: Freeleaps Metrics service has been down for more than 1 minutes. + runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7 + - alert: FreeleapsMetricsServiceHighErrorRate + expr: rate(http_requests_total{job="metrics-service",status=~"5.."}[5m]) > 0.1 + for: 5m + labels: + severity: warning + service: metrics-service + annotations: + summary: High error rate in freeleaps metrics service (instance {{ $labels.instance }}) + description: Freeleaps Metrics service error rate is {{ $value }} errors per second. + runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7