Istio & ArgoCD Alertmanager PrometheusRules

Prometheus rules for Istio and ArgoCD, defined as PrometheusRule resources for the Prometheus Operator. They work in conjunction with Alertmanager.
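
The Prometheus Operator only loads PrometheusRule objects whose labels match the ruleSelector of its Prometheus resource, which is why both manifests below carry the app/release labels. A minimal sketch of the relevant part of the Prometheus resource (name and namespace are illustrative and depend on your installation):

apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
  name: prometheus-operator-prometheus # Illustrative name; depends on your install.
  namespace: monitoring
spec:
  # Only PrometheusRule objects carrying these labels are loaded.
  ruleSelector:
    matchLabels:
      app: prometheus-operator
      release: prometheus-operator
  # An empty selector looks for matching rules in all namespaces.
  ruleNamespaceSelector: {}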

Istio

Reference: https://discuss.istio.io/t/prometheus-alerting-on-istio-components/2167/18

apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: istio-alerts
  namespace: istio-system
  labels:
    app: prometheus-operator # Customize to match your Prometheus ruleSelector.
    release: prometheus-operator # Customize to match your Prometheus ruleSelector.
spec:
  groups:
  - name: istio
    rules:
    - alert: IstioPilotAvailabilityDrop
      annotations:
        summary: 'Istio Pilot Availability Drop'
        description: 'Pilot pods have dropped during the last 5m (current value: *{{ printf "%2.0f%%" $value }}*). Envoy sidecars might have outdated configuration'
      expr: >
        avg(avg_over_time(up{job="pilot"}[1m])) < 0.5
      for: 5m
    - alert: IstioMixerTelemetryAvailabilityDrop
      annotations:
        summary: 'Istio Mixer Telemetry Drop'
        description: 'Mixer pods have dropped during the last 5m (current value: *{{ printf "%2.0f%%" $value }}*). Istio metrics will not work correctly'
      expr: >
        avg(avg_over_time(up{job="mixer", service="istio-telemetry", endpoint="http-monitoring"}[5m])) < 0.5
      for: 5m
    - alert: IstioGalleyAvailabilityDrop
      annotations:
        summary: 'Istio Galley Availability Drop'
        description: 'Galley pods have dropped during the last 5m (current value: *{{ printf "%2.0f%%" $value }}*). Istio config ingestion and processing will not work'
      expr: >
        avg(avg_over_time(up{job="galley"}[5m])) < 0.5
      for: 5m
    - alert: IstioGatewayAvailabilityDrop
      annotations:
        summary: 'Istio Gateway Availability Drop'
        description: 'Gateway pods have dropped during the last 5m (current value: *{{ printf "%2.0f%%" $value }}*). Inbound traffic will likely be affected'
      expr: >
        min(kube_deployment_status_replicas_available{deployment="istio-ingressgateway", namespace="istio-system"}) without (instance, pod) < 2
      for: 5m
    - alert: IstioPilotPushErrorsHigh
      annotations:
        summary: 'Number of Istio Pilot push errors is too high'
        description: 'Pilot has too many push errors during the last 5m (current value: *{{ printf "%2.0f%%" $value }}*). Envoy sidecars might have outdated configuration'
      expr: >
        sum(irate(pilot_xds_push_errors{job="pilot"}[5m])) / sum(irate(pilot_xds_pushes{job="pilot"}[5m])) > 0.05
      for: 5m
    - alert: IstioMixerPrometheusDispatchesLow
      annotations:
        summary: 'Number of Mixer dispatches to Prometheus is too low'
        description: 'Mixer dispatches to Prometheus have dropped below normal levels during the last 5m (current value: *{{ printf "%2.0f%%" $value }}*). Istio metrics might not be exported properly'
      expr: >
        sum(irate(mixer_runtime_dispatches_total{adapter=~"prometheus"}[5m])) < 180
      for: 5m
    - alert: IstioGlobalRequestRateHigh
      annotations:
        summary: 'Istio Global Request Rate High'
        description: 'Istio global request rate is unusually high during the last 5m (current value: *{{ printf "%2.0f%%" $value }}*). The amount of traffic being generated inside the service mesh is higher than normal'
      expr: >
        round(sum(irate(istio_requests_total{reporter="destination"}[5m])), 0.001) > 1200
      for: 5m
    - alert: IstioGlobalRequestRateLow
      annotations:
        summary: 'Istio global request rate too low'
        description: 'Istio global request rate is unusually low during the last 5m (current value: *{{ printf "%2.0f%%" $value }}*). The amount of traffic being generated inside the service mesh has dropped below usual levels'
      expr: >
        round(sum(irate(istio_requests_total{reporter="destination"}[5m])), 0.001) < 300
      for: 5m
    - alert: IstioGlobalHTTP5xxRateHigh
      annotations:
        summary: 'Istio Percentage of HTTP 5xx responses is too high'
        description: 'Istio global HTTP 5xx rate is too high in last 5m (current value: *{{ printf "%2.0f%%" $value }}*). The HTTP 5xx errors within the service mesh is unusually high'
      expr: >
         sum(irate(istio_requests_total{reporter="destination", response_code=~"5.*"}[5m])) / sum(irate(istio_requests_total{reporter="destination"}[5m])) > 0.01
      for: 5m
    - alert: IstioGatewayOutgoingSuccessLow
      annotations:
        summary: 'Istio Gateway outgoing success rate is too low'
        description: 'Istio Gateway success to outbound destinations is too low in last 5m (current value: *{{ printf "%2.0f%%" $value }}*). Inbound traffic may be affected'
      expr: >
        sum(irate(istio_requests_total{reporter="source", source_workload="istio-ingressgateway",source_workload_namespace="istio-system", connection_security_policy!="mutual_tls",response_code!~"5.*"}[5m])) /  sum(irate(istio_requests_total{reporter="source", source_workload="istio-ingressgateway",source_workload_namespace="istio-system", connection_security_policy!="mutual_tls"}[5m])) < 0.995
      for: 5m

ArgoCD
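
The rules below are built on the argocd_app_info metric exposed by the ArgoCD application controller, so Prometheus has to scrape that endpoint first. A minimal ServiceMonitor sketch, assuming the service and port names of a default ArgoCD install (argocd-metrics / metrics); adjust to your setup:

apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: argocd-metrics
  namespace: argocd
  labels:
    release: prometheus-operator # Must match the Prometheus serviceMonitorSelector.
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: argocd-metrics # Label on the argocd-metrics Service in a default install.
  endpoints:
  - port: metrics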

apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: argocd-alerts
  namespace: argocd
  labels:
    app: prometheus-operator # Customize to match your Prometheus ruleSelector.
    release: prometheus-operator # Customize to match your Prometheus ruleSelector.
spec:
  groups:
  - name: argocd
    rules:
    - alert: "ArgoCDAppOutOfSync"
      expr: argocd_app_info{sync_status="OutOfSync"} > 0
      annotations:
        message: "ArgoCD Application {{ $labels.name }} is OutOfSync for longer than 5 minutes"
      for: 5m
      labels:
        severity: critical
    - alert: "ArgoCDAppProgressing"
      expr: argocd_app_info{health_status="Progressing"} > 0
      annotations:
        message: "ArgoCD Application {{ $labels.name }} Progressing longer than 5 minutes"
      for: 5m
      labels:
        severity: critical
    - alert: "ArgoCDAppUnknown"
      expr: argocd_app_info{health_status="Unknown"} > 0
      annotations:
        message: "ArgoCD Application {{ $labels.name }} Unknown for 1 minute"
      for: 1m
      labels:
        severity: critical
    - alert: "ArgoCDAppMissing"
      expr: argocd_app_info{health_status="Missing"} > 0
      annotations:
        message: "ArgoCD Application {{ $labels.name }} Missing for 5 minutes"
      for: 5m
      labels:
        severity: critical
    - alert: "ArgoCDAppDegraded"
      expr: argocd_app_info{health_status="Degraded"} > 0
      annotations:
        message: "ArgoCD Application {{ $labels.name }} Degraded for 5 minutes"
      for: 5m
      labels:
        severity: critical
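
The ArgoCD rules above attach severity: critical to every alert. For reference, a minimal Alertmanager route that sends such alerts to a dedicated receiver might look like the following sketch; the receiver names and Slack settings are placeholders, not part of this setup:

route:
  receiver: default
  routes:
  - receiver: critical-alerts
    match:
      severity: critical
receivers:
- name: default
- name: critical-alerts
  slack_configs:
  - channel: '#alerts'
    api_url: 'https://hooks.slack.com/services/XXX/XXX/XXX' # Placeholder webhook URL.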