diff --git a/docs/Service Monitor and Error Alter Integration Guideline.md b/docs/Service Monitor and Error Alter Integration Guideline.md index 0d9369b9..a45c2353 100644 --- a/docs/Service Monitor and Error Alter Integration Guideline.md +++ b/docs/Service Monitor and Error Alter Integration Guideline.md @@ -1,32 +1,91 @@ -# Prometheus Alter Rule Config +# 1. Prometheus Alert Rule Configuration -Add `prometheusrule.yaml` to `/templates`. -see -``` +## 1.1. Add `prometheusrule.yaml` to `/templates`. + +Example: + +> Replace `metrics` in `.Values.metrics` with your service name; see freeleaps-ops/freeleaps/helm-pkg/metrics for a working example. +```yaml {{- /* Copyright Broadcom, Inc. All Rights Reserved. SPDX-License-Identifier: APACHE-2.0 */}} -{{- if and .Values.metrics.enabled .Values.metrics.prometheusRule.enabled }} +{{- if .Values.metrics.prometheusRule.enabled }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: - name: {{ include "common.names.fullname" . }} - namespace: {{ default (include "common.names.namespace" .) .Values.metrics.prometheusRule.namespace | quote}} - labels: {{- include "common.labels.standard" ( dict "customLabels" .Values.commonLabels "context" $ ) | nindent 4 }} - {{- if .Values.metrics.prometheusRule.additionalLabels }} - {{- include "common.tplvalues.render" (dict "value" .Values.metrics.prometheusRule.additionalLabels "context" $) | nindent 4 }} - {{- end }} - {{- if .Values.commonAnnotations }} - annotations: {{- include "common.tplvalues.render" ( dict "value" .Values.commonAnnotations "context" $ ) | nindent 4 }} + name: {{ .Values.metrics.prometheusRule.name }} + namespace: {{ .Values.metrics.prometheusRule.namespace | quote }} + {{- with .Values.metrics.prometheusRule.labels }} + labels: + {{- toYaml . | nindent 4 }} {{- end }} spec: groups: {{- with .Values.metrics.prometheusRule.rules }} - - name: {{ template "common.names.name" $ }} - rules: {{- include "common.tplvalues.render" (dict "value" . 
"context" $) | nindent 8 }} + - name: {{ $.Values.metrics.prometheusRule.name }} + rules: + {{- range . }} + - alert: {{ .alert }} + expr: {{ .expr | quote }} + {{- if .for }} + for: {{ .for }} + {{- end }} + {{- if .labels }} + labels: + {{- toYaml .labels | nindent 12 }} + {{- end }} + {{- if .annotations }} + annotations: + {{- toYaml .annotations | nindent 12 }} + {{- end }} + {{- end }} {{- end }} {{- end }} +``` -``` \ No newline at end of file +## 1.2. Add the `prometheusRule` configuration to values.{alpha/prod}.yaml +Example: + +> See freeleaps-ops/freeleaps/helm-pkg/metrics for a complete example. Nest this block under your service's top-level key (`metrics:` in that chart), because the template reads `.Values.metrics.prometheusRule`. + +```yaml +prometheusRule: + name: freepeals-metrics + enabled: true + namespace: "freeleaps-monitoring-system" + labels: + release: kube-prometheus-stack + rules: + - alert: FreeleapsMetricsServiceDown + expr: up{job="metrics-service"} == 0 + for: 1m + labels: + severity: critical + service: metrics-service + annotations: + summary: "Freeleaps Metrics service is down (instance {{ $labels.instance }})" + description: "Freeleaps Metrics service has been down for more than 1 minutes." + runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7" + + - alert: FreeleapsMetricsServiceHighErrorRate + expr: rate(http_requests_total{job="metrics-service",status=~"5.."}[5m]) > 0.1 + for: 5m + labels: + severity: warning + service: metrics-service + annotations: + summary: "High error rate in freeleaps metrics service (instance {{ $labels.instance }})" + description: "Freeleaps Metrics service error rate is {{ $value }} errors per second." + runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7" +``` + +## 1.3. 
Verify Alert Rule Configuration is Effective + +> Redirect to local +![alt text](asserts/image4.png) + +> You can see the newly added rules indicating they are effective + +![alt text](asserts/image5.png) \ No newline at end of file diff --git a/docs/asserts/image4.png b/docs/asserts/image4.png new file mode 100644 index 00000000..c887606e Binary files /dev/null and b/docs/asserts/image4.png differ diff --git a/docs/asserts/image5.png b/docs/asserts/image5.png new file mode 100644 index 00000000..45cc0088 Binary files /dev/null and b/docs/asserts/image5.png differ diff --git a/freeleaps/helm-pkg/3rd/rabbitmq/values.alpha.yaml b/freeleaps/helm-pkg/3rd/rabbitmq/values.alpha.yaml index 60b325d8..a43b3a7e 100644 --- a/freeleaps/helm-pkg/3rd/rabbitmq/values.alpha.yaml +++ b/freeleaps/helm-pkg/3rd/rabbitmq/values.alpha.yaml @@ -25,6 +25,7 @@ auth: securePassword: true updatePassword: false enableLoopbackUser: false + erlangCookie: "iGF4ZVjeaXgHW2xELZTxJl8a6aFY7nes" tls: enabled: false logs: "-" diff --git a/freeleaps/helm-pkg/3rd/rabbitmq/values.prod.yaml b/freeleaps/helm-pkg/3rd/rabbitmq/values.prod.yaml index e6e234bb..32143f4e 100644 --- a/freeleaps/helm-pkg/3rd/rabbitmq/values.prod.yaml +++ b/freeleaps/helm-pkg/3rd/rabbitmq/values.prod.yaml @@ -162,6 +162,7 @@ auth: ## ref: https://github.com/bitnami/containers/tree/main/bitnami/rabbitmq#environment-variables ## updatePassword: false + ## @param auth.existingPasswordSecret Existing secret with RabbitMQ credentials (existing secret must contain a value for `rabbitmq-password` key or override with setting auth.existingSecretPasswordKey) ## e.g: ## existingPasswordSecret: name-of-existing-secret @@ -177,7 +178,7 @@ auth: ## @param auth.erlangCookie Erlang cookie to determine whether different nodes are allowed to communicate with each other ## ref: https://github.com/bitnami/containers/tree/main/bitnami/rabbitmq#environment-variables ## - erlangCookie: "" + erlangCookie: "iGF4ZVjeaXgHW2xELZTxJl8a6aFY7nes" ## @param 
auth.existingErlangSecret Existing secret with RabbitMQ Erlang cookie (must contain a value for `rabbitmq-erlang-cookie` key or override with auth.existingSecretErlangKey) ## e.g: ## existingErlangSecret: name-of-existing-secret diff --git a/freeleaps/helm-pkg/authentication/values.prod.yaml b/freeleaps/helm-pkg/authentication/values.prod.yaml index b8807e6b..83794fd4 100644 --- a/freeleaps/helm-pkg/authentication/values.prod.yaml +++ b/freeleaps/helm-pkg/authentication/values.prod.yaml @@ -1,7 +1,3 @@ -global: - registry: docker.io - repository: freeleaps - nodeSelector: {} dashboard: enabled: true name: freeleaps-prod-authentication-dashboard @@ -20,7 +16,7 @@ authentication: registry: docker.io repository: null name: authentication - tag: 1.9.0 + tag: snapshot-ee519ca imagePullPolicy: IfNotPresent ports: - name: http diff --git a/freeleaps/helm-pkg/content/values.prod.yaml b/freeleaps/helm-pkg/content/values.prod.yaml index 95a92848..dad4ffd4 100644 --- a/freeleaps/helm-pkg/content/values.prod.yaml +++ b/freeleaps/helm-pkg/content/values.prod.yaml @@ -18,7 +18,7 @@ content: registry: docker.io repository: null name: content - tag: 1.9.0 + tag: snapshot-ee519ca imagePullPolicy: IfNotPresent ports: - name: http diff --git a/freeleaps/helm-pkg/devops/values.alpha.yaml b/freeleaps/helm-pkg/devops/values.alpha.yaml index 4cc64629..1a2d7cb8 100644 --- a/freeleaps/helm-pkg/devops/values.alpha.yaml +++ b/freeleaps/helm-pkg/devops/values.alpha.yaml @@ -10,7 +10,7 @@ devops: registry: docker.io repository: null name: devops - tag: snapshot-96f2f52 + tag: snapshot-3cba9e4 imagePullPolicy: IfNotPresent ports: - name: http diff --git a/freeleaps/helm-pkg/metrics/values.alpha.yaml b/freeleaps/helm-pkg/metrics/values.alpha.yaml index 823193c9..aecb20d5 100644 --- a/freeleaps/helm-pkg/metrics/values.alpha.yaml +++ b/freeleaps/helm-pkg/metrics/values.alpha.yaml @@ -84,7 +84,9 @@ metrics: prometheusRule: name: freepeals-alpha-metrics enabled: false - namespace: 
freeleaps-monitoring-system + namespace: "freeleaps-monitoring-system" + labels: + release: kube-prometheus-stack rules: - alert: FreeleapsMetricsServiceDown expr: up{job="metrics-service"} == 0 diff --git a/freeleaps/helm-pkg/metrics/values.prod.yaml b/freeleaps/helm-pkg/metrics/values.prod.yaml index 618c52a2..d94a16b0 100644 --- a/freeleaps/helm-pkg/metrics/values.prod.yaml +++ b/freeleaps/helm-pkg/metrics/values.prod.yaml @@ -15,7 +15,7 @@ metrics: registry: docker.io repository: null name: metrics - tag: snapshot-38ff0ae + tag: snapshot-3cba9e4 imagePullPolicy: IfNotPresent ports: - name: http @@ -28,7 +28,7 @@ metrics: limits: cpu: '0.2' memory: 128Mi - probes: + probes: liveness: type: httpGet config: @@ -85,25 +85,26 @@ metrics: name: freepeals-prod-metrics enabled: true namespace: "freeleaps-monitoring-system" + labels: + release: kube-prometheus-stack rules: - - alert: FreeleapsMetricsServiceDown - expr: up{job="metrics-service"} == 0 - for: 1m - labels: - severity: critical - service: metrics-service - annotations: - summary: "Freeleaps Metrics service is down (instance {{ $labels.instance }})" - description: "Freeleaps Metrics service has been down for more than 1 minutes." - runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7" - - - alert: FreeleapsMetricsServiceHighErrorRate - expr: rate(http_requests_total{job="metrics-service",status=~"5.."}[5m]) > 0.1 - for: 5m - labels: - severity: warning - service: metrics-service - annotations: - summary: "High error rate in freeleaps metrics service (instance {{ $labels.instance }})" - description: "Freeleaps Metrics service error rate is {{ $value }} errors per second." 
- runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7" \ No newline at end of file + - alert: FreeleapsMetricsServiceDown + expr: up{job="metrics-service"} == 0 + for: 1m + labels: + severity: critical + service: metrics-service + annotations: + summary: Freeleaps Metrics service is down (instance {{ $labels.instance }}) + description: Freeleaps Metrics service has been down for more than 1 minutes. + runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7 + - alert: FreeleapsMetricsServiceHighErrorRate + expr: rate(http_requests_total{job="metrics-service",status=~"5.."}[5m]) > 0.1 + for: 5m + labels: + severity: warning + service: metrics-service + annotations: + summary: High error rate in freeleaps metrics service (instance {{ $labels.instance }}) + description: Freeleaps Metrics service error rate is {{ $value }} errors per second. 
+ runbook_url: https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7 diff --git a/freeleaps/helm-pkg/metrics/values.yaml b/freeleaps/helm-pkg/metrics/values.yaml index 5a1d5c8f..c50de0fa 100644 --- a/freeleaps/helm-pkg/metrics/values.yaml +++ b/freeleaps/helm-pkg/metrics/values.yaml @@ -84,6 +84,8 @@ metrics: name: freepeals-metrics enabled: true namespace: "freeleaps-monitoring-system" + labels: + release: kube-prometheus-stack rules: - alert: FreeleapsMetricsServiceDown expr: up{job="metrics-service"} == 0 diff --git a/freeleaps/helm-pkg/notification/values.alpha.yaml b/freeleaps/helm-pkg/notification/values.alpha.yaml index 291ada4e..1c23b375 100644 --- a/freeleaps/helm-pkg/notification/values.alpha.yaml +++ b/freeleaps/helm-pkg/notification/values.alpha.yaml @@ -32,23 +32,23 @@ notification: readiness: type: httpGet config: - path: /api/_/readyz + path: /api/_/livez port: 8003 - initialDelaySeconds: 30 + initialDelaySeconds: 60 periodSeconds: 30 timeoutSeconds: 3 successThreshold: 1 - failureThreshold: 3 + failureThreshold: 5 liveness: type: httpGet config: path: /api/_/livez port: 8003 - initialDelaySeconds: 30 - periodSeconds: 15 + initialDelaySeconds: 60 + periodSeconds: 30 timeoutSeconds: 3 successThreshold: 1 - failureThreshold: 3 + failureThreshold: 5 terminationGracePeriodSeconds: 30 services: - name: notification-service diff --git a/freeleaps/helm-pkg/notification/values.prod.yaml b/freeleaps/helm-pkg/notification/values.prod.yaml index d8e0b2f0..6db3b7bf 100644 --- a/freeleaps/helm-pkg/notification/values.prod.yaml +++ b/freeleaps/helm-pkg/notification/values.prod.yaml @@ -13,7 +13,7 @@ notification: registry: docker.io repository: null name: notification - tag: 1.9.0 + tag: snapshot-ee519ca imagePullPolicy: IfNotPresent ports: - name: http @@ -30,23 +30,23 @@ notification: readiness: type: httpGet config: - path: /api/_/readyz + path: /api/_/livez port: 8003 - 
initialDelaySeconds: 30 + initialDelaySeconds: 60 periodSeconds: 30 timeoutSeconds: 3 successThreshold: 1 - failureThreshold: 3 + failureThreshold: 5 liveness: type: httpGet config: path: /api/_/livez port: 8003 - initialDelaySeconds: 30 - periodSeconds: 15 + initialDelaySeconds: 60 + periodSeconds: 30 timeoutSeconds: 3 successThreshold: 1 - failureThreshold: 3 + failureThreshold: 5 terminationGracePeriodSeconds: 30 services: - name: notification-service diff --git a/freeleaps/prod/ci/freeleaps-service-hub/Jenkinsfile b/freeleaps/prod/ci/freeleaps-service-hub/Jenkinsfile index c1faec17..84e557e8 100644 --- a/freeleaps/prod/ci/freeleaps-service-hub/Jenkinsfile +++ b/freeleaps/prod/ci/freeleaps-service-hub/Jenkinsfile @@ -23,7 +23,7 @@ executeFreeleapsPipeline { sastEnabled: false, imageRegistry: 'docker.io', imageRepository: 'freeleaps', - imageName: 'devops', + imageName: 'metrics', imageBuilder: 'dind', dockerfilePath: 'Dockerfile', imageBuildRoot: '.',