From 6ff568501b204089ef93ab82436282148e5d0c0b Mon Sep 17 00:00:00 2001 From: Jayapriya Pai Date: Tue, 19 Nov 2024 13:48:02 +0530 Subject: [PATCH] MON-4059: Update TelemeterClientFailures alert Use the new `metricsclient_http_requests_total` metric, which makes it possible to distinguish 4xx errors (e.g. a bad pull secret) from 5xx errors (an issue on the Red Hat side). The alert now evaluates the rate of non-200 responses per `client` and `status_code` over 15m, instead of a failure ratio above 20% per namespace. Signed-off-by: Jayapriya Pai --- jsonnet/telemeter/client/kubernetes.libsonnet | 7 +------ manifests/client/prometheusRule.yaml | 7 +------ 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/jsonnet/telemeter/client/kubernetes.libsonnet b/jsonnet/telemeter/client/kubernetes.libsonnet index 445c39a42..be5606a46 100644 --- a/jsonnet/telemeter/client/kubernetes.libsonnet +++ b/jsonnet/telemeter/client/kubernetes.libsonnet @@ -253,12 +253,7 @@ local securePort = 8443; }, { expr: ||| - sum by (namespace) ( - rate(federate_requests_failed_total{job="telemeter-client"}[15m]) - ) / - sum by (namespace) ( - rate(federate_requests_total{job="telemeter-client"}[15m]) - ) > 0.2 + sum by(client, status_code) (rate(metricsclient_http_requests_total{status_code!~"200"}[15m])) > 0 |||, labels: { severity: 'warning', diff --git a/manifests/client/prometheusRule.yaml b/manifests/client/prometheusRule.yaml index 87a8761fd..99ab56941 100644 --- a/manifests/client/prometheusRule.yaml +++ b/manifests/client/prometheusRule.yaml @@ -18,12 +18,7 @@ spec: If the telemeter client fails to authenticate with the telemeter service, make sure that the global pull secret is up to date, see https://docs.openshift.com/container-platform/latest/openshift_images/managing_images/using-image-pull-secrets.html#images-update-global-pull-secret_using-image-pull-secrets for more details. 
summary: Telemeter client fails to send metrics expr: | - sum by (namespace) ( - rate(federate_requests_failed_total{job="telemeter-client"}[15m]) - ) / - sum by (namespace) ( - rate(federate_requests_total{job="telemeter-client"}[15m]) - ) > 0.2 + sum by(client, status_code) (rate(metricsclient_http_requests_total{status_code!~"200"}[15m])) > 0 for: 1h labels: severity: warning