---
# Values for the Freeleaps metrics service: image, ports, resources, probes,
# Service / ServiceMonitor settings, VPA bounds and Prometheus alert rules.
#
# NOTE(review): the nesting below was reconstructed from a flattened copy of
# this file. Confirm key placement against the chart templates -- in
# particular the probe timing fields (placed next to `config`), and whether
# `logIngest`, `dashboard`, `vpa` and `prometheusRule` are top-level keys.

global:
  registry: docker.io
  repository: freeleaps
  nodeSelector: {}

logIngest:
  enabled: false

dashboard:
  enabled: false
  name: freeleaps-metrics-dashboard
  title: Freeleaps Metrics Dashboard

metrics:
  replicas: 1
  image:
    registry: docker.io
    # NOTE(review): null here; presumably the templates fall back to
    # global.repository -- confirm.
    repository: null
    name: metrics
    # Quoted so the tag stays a string.
    tag: "1.0.0"
  imagePullPolicy: IfNotPresent
  ports:
    - name: http
      containerPort: 8009
      protocol: TCP
  resources:
    # CPU quantities are quoted so they are passed through as strings.
    requests:
      cpu: '0.1'
      memory: 64Mi
    limits:
      cpu: '0.2'
      memory: 128Mi
  probes:
    # Both probes issue HTTP GETs against the container port (8009).
    liveness:
      type: httpGet
      config:
        path: /api/_/livez
        port: 8009
      initialDelaySeconds: 30
      periodSeconds: 10
      timeoutSeconds: 10
      successThreshold: 1
      failureThreshold: 5
      terminationGracePeriodSeconds: 30
    readiness:
      type: httpGet
      config:
        path: /api/_/readyz
        port: 8009
      initialDelaySeconds: 30
      periodSeconds: 10
      timeoutSeconds: 10
      successThreshold: 1
      failureThreshold: 5
  services:
    # The alert expressions below select on job="metrics-service", which
    # matches this service name.
    - name: metrics-service
      type: ClusterIP
      port: 8009
      targetPort: 8009
  serviceMonitor:
    enabled: true
    labels:
      release: kube-prometheus-stack
    namespace: freeleaps-monitoring-system
    interval: 30s
    scrapeTimeout: 10s
  configs:
    # Connection settings are left empty here and must be supplied per
    # environment.
    starrocksHost: ""
    # NOTE(review): 8009 is the same value as this service's own HTTP port;
    # StarRocks FE's default MySQL-protocol query port is 9030 -- confirm.
    starrocksPort: 8009
    starrocksUser: ""
    starrocksPassword: ""
    starrocksDatabase: ""
    prometheusEndpoint: ""

vpa:
  minAllowed:
    enabled: false
    cpu: 100m
    memory: 64Mi
  # NOTE(review): maxAllowed.cpu (100m) equals minAllowed.cpu and the CPU
  # request ('0.1'), and is below the CPU limit ('0.2'). With this cap the
  # CPU recommendation has no headroom -- confirm this is intentional.
  maxAllowed:
    enabled: true
    cpu: 100m
    memory: 128Mi
  controlledResources:
    - cpu
    - memory

prometheusRule:
  name: freeleaps-metrics
  enabled: true
  namespace: "freeleaps-monitoring-system"
  labels:
    release: kube-prometheus-stack
  rules:
    # Fires when the `up` series for the metrics-service job has been 0 for
    # one minute.
    - alert: FreeleapsMetricsServiceDown
      expr: 'up{job="metrics-service"} == 0'
      for: 1m
      labels:
        severity: critical
        service: metrics-service
      annotations:
        summary: "Freeleaps Metrics service is down (instance {{ $labels.instance }})"
        description: "Freeleaps Metrics service has been down for more than 1 minute."
        runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7"
    # Fires when the per-second rate of requests with a 5xx status, taken
    # over a 5m window, stays above 0.1 for five minutes.
    - alert: FreeleapsMetricsServiceHighErrorRate
      expr: 'rate(http_requests_total{job="metrics-service",status=~"5.."}[5m]) > 0.1'
      for: 5m
      labels:
        severity: warning
        service: metrics-service
      annotations:
        summary: "High error rate in freeleaps metrics service (instance {{ $labels.instance }})"
        description: "Freeleaps Metrics service error rate is {{ $value }} errors per second."
        runbook_url: "https://netorgft10898514.sharepoint.com/:w:/s/FreeleapsEngineeringTeam/EUlvzumTsPxCpPAzI3gm9OIB0DCLTjQzzYVL6VsHYZFjxg?e=0dxVr7"
    # Disabled alerts, kept for reference.
    # - alert: MetricsServiceHighLatency
    #   expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{job="metrics-service"}[5m])) > 1
    #   for: 5m
    #   labels:
    #     severity: warning
    #     service: metrics-service
    #   annotations:
    #     summary: "High latency in metrics service (instance {{ $labels.instance }})"
    #     description: "95th percentile latency is {{ $value }} seconds."
    # - alert: MetricsServiceHighMemoryUsage
    #   expr: (process_resident_memory_bytes{job="metrics-service"} / 1024 / 1024) > 512
    #   for: 5m
    #   labels:
    #     severity: warning
    #     service: metrics
    #   annotations:
    #     summary: "High memory usage in metrics service (instance {{ $labels.instance }})"
    #     description: "Memory usage is {{ $value }} MB."
    # - alert: MetricsServiceHighCPUUsage
    #   expr: rate(process_cpu_seconds_total{job="metrics-service"}[5m]) * 100 > 80
    #   for: 5m
    #   labels:
    #     severity: warning
    #     service: metrics
    #   annotations:
    #     summary: "High CPU usage in metrics service (instance {{ $labels.instance }})"
    #     description: "CPU usage is {{ $value }}%."
    # - alert: MetricsServiceNoData
    #   expr: absent(up{job="metrics-service"})
    #   for: 5m
    #   labels:
    #     severity: critical
    #     service: metrics
    #   annotations:
    #     summary: "No data from metrics service (instance {{ $labels.instance }})"
    #     description: "No metrics data received from metrics service for more than 5 minutes."