Загрузка данных
dmitriev-aal@VDI-Dmitriev-A:~/Desktop/appfarm/infra/k8s/nexus$ git diff origin/master -- \
> deploy/files/alerts/nexus-common.rules \
> deploy/files/alerts/nexus-alerts.rules \
> deploy/files/alerts/nexus-blackbox-proxy.rules
diff --git a/deploy/files/alerts/nexus-alerts.rules b/deploy/files/alerts/nexus-alerts.rules
index a03eabb..96cf831 100644
--- a/deploy/files/alerts/nexus-alerts.rules
+++ b/deploy/files/alerts/nexus-alerts.rules
@@ -2,9 +2,10 @@
rules:
- alert: CriticalNexusOrientDBFillingUP
annotations:
- description: The PersistentVolume claimed by {{`{{ $labels.persistentvolumeclaim }}`}} in Namespace {{`{{ $labels.namespace }}`}} is only {{`{{ $value | humanizePercentage }}`}} free.
- runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
- summary: PersistentVolume is filling up.
+ description: "The PersistentVolume claimed by {{`{{ $labels.persistentvolumeclaim }}`}} in Namespace {{`{{ $labels.namespace }}`}} is only {{`{{ $value | humanizePercentage }}`}} free."
+ runbook_url: "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup"
+ summary: "PersistentVolume is filling up."
+ dashboard_url: "{{ .Values.prometheusRules.params.dashboard_url }}"
expr: |-
kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics", persistentvolumeclaim="nexus-nexus3-data"}
/
@@ -12,31 +13,54 @@
< 0.03
for: 1m
labels:
+ team: sre
+ service: nexus
+ component: storage
+ environment: production
severity: critical
+
- alert: WarningNexusOrientDBFillingUP
annotations:
- description: Based on recent sampling, the PersistentVolume claimed by {{`{{ $labels.persistentvolumeclaim }}`}} in Namespace {{`{{ $labels.namespace }}`}} is expected to fill up within four days. Currently {{`{{ $value | humanizePercentage }}`}} is available.
- runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
- summary: PersistentVolume is filling up.
+ description: "Based on recent sampling, the PersistentVolume claimed by {{`{{ $labels.persistentvolumeclaim }}`}} in Namespace {{`{{ $labels.namespace }}`}} is expected to fill up within four days. Currently {{`{{ $value | humanizePercentage }}`}} is available."
+ runbook_url: "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup"
+ summary: "PersistentVolume is filling up."
+ dashboard_url: "{{ .Values.prometheusRules.params.dashboard_url }}"
expr: |-
- kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics", persistentvolumeclaim="nexus-nexus3-data"} /1024/1024/1024 < 6
+ kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics", persistentvolumeclaim="nexus-nexus3-data"} /1024/1024/1024 < 6
for: 1h
labels:
+ team: sre
+ service: nexus
+ component: storage
+ environment: production
severity: warning
+
- alert: CriticalNexusOrientDBFillingErrors
annotations:
- description: The persistent volume {{`{{ $labels.persistentvolume }}`}} has status {{`{{ $labels.phase }}`}}.
- runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors
- summary: PersistentVolume is having issues with provisioning.
+ description: "The persistent volume {{`{{ $labels.persistentvolume }}`}} has status {{`{{ $labels.phase }}`}}."
+ runbook_url: "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors"
+ summary: "PersistentVolume is having issues with provisioning."
+ dashboard_url: "{{ .Values.prometheusRules.params.dashboard_url }}"
expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics", persistentvolumeclaim="nexus-nexus3-data"} > 0
for: 5m
labels:
+ team: sre
+ service: nexus
+ component: storage
+ environment: production
severity: critical
- - alert: NexusTaskNotOK
+
+ - alert: NexusTaskFailures
annotations:
- description: Task with name {{`{{ $labels.name }}`}} was completed with is NOT OK status.
- summary: Check Nexus UI (System -> Tasks) and logs for investigating.
- expr: nexus_tasks_status{lastRunResult!~"OK|<nil>"} > 0
+ description: "Task with name {{`{{ $labels.name }}`}} was completed with NOT OK status {{`{{ $labels.lastRunResult }}`}}."
+ summary: "Check Nexus UI (System -> Tasks) and logs for investigating."
+ dashboard_url: "{{ .Values.prometheusRules.params.dashboard_url }}"
+ runbook_url: "{{ .Values.prometheusRules.params.runbook_url }}"
+ expr: nexus_tasks_status{lastRunResult!~"OK|<nil>"}
for: 1m
labels:
- severity: critical
+ team: sre
+ service: nexus
+ component: tasks
+ environment: production
+ severity: critical
\ No newline at end of file
diff --git a/deploy/files/alerts/nexus-blackbox-proxy.rules b/deploy/files/alerts/nexus-blackbox-proxy.rules
index 80f5744..c0ab151 100644
--- a/deploy/files/alerts/nexus-blackbox-proxy.rules
+++ b/deploy/files/alerts/nexus-blackbox-proxy.rules
@@ -1,78 +1,60 @@
- name: nexus-blackbox-proxy
rules:
- - alert: NexusProxyDown
- expr: probe_success{job="nexus-blackbox-proxy"} == 0
- for: 1m
- labels:
- severity: critical
- component: proxy
- service: blackbox
- annotations:
- summary: "Proxy connection failed for {{`{{ $labels.instance }}`}}"
- description: "Blackbox exporter failed to connect to {{`{{ $labels.instance }}`}} through proxy. The proxy might be down or the target is unreachable."
+ - alert: NexusProxyTargetDown
+ annotations:
+ summary: "Proxy target {{`{{ $labels.instance }}`}} is down."
+ description: "Blackbox probe through Nexus proxy failed for {{`{{ $labels.instance }}`}} for 3 minutes."
+ dashboard_url: "{{ .Values.prometheusRules.params.dashboard_url }}"
+ runbook_url: "{{ .Values.prometheusRules.params.runbook_url }}"
+ expr: probe_success{job="nexus-proxy-targets"} == 0
+ for: 3m
+ labels:
+ team: sre
+ service: nexus
+ component: proxy
+ environment: production
+ severity: critical
- - alert: NexusProxyHighLatency
- expr: probe_duration_seconds{job="nexus-blackbox-proxy"} > 5
- for: 5m
- labels:
- severity: warning
- component: proxy
- service: blackbox
- annotations:
- summary: "High latency detected for {{`{{ $labels.instance }}`}}"
- description: "Proxy request to {{`{{ $labels.instance }}`}} took {{`{{ $value }}`}}s (threshold: 5s). This might indicate network issues or proxy performance problems."
+ - alert: NexusProxyHighLatency
+ annotations:
+ summary: "High Nexus proxy latency for {{`{{ $labels.instance }}`}}."
+ description: "Proxy request to {{`{{ $labels.instance }}`}} has p95 latency above 3 seconds for 5 minutes."
+ dashboard_url: "{{ .Values.prometheusRules.params.dashboard_url }}"
+ runbook_url: "{{ .Values.prometheusRules.params.runbook_url }}"
+ expr: quantile_over_time(0.95, probe_duration_seconds{job="nexus-proxy-targets"}[5m]) > 3
+ for: 5m
+ labels:
+ team: sre
+ service: nexus
+ component: proxy
+ environment: production
+ severity: warning
- - alert: NexusProxySSLCertificateExpiringSoon
- expr: probe_ssl_earliest_cert_expiry{job="nexus-blackbox-proxy"} - time() < 86400 * 7
- for: 1h
- labels:
- severity: warning
- component: proxy
- service: blackbox
- annotations:
- summary: "SSL certificate expiring soon for {{`{{ $labels.instance }}`}}"
- description: "SSL certificate for {{`{{ $labels.instance }}`}} will expire in {{`{{ $value | humanizeDuration }}`}}."
+ - alert: NexusProxyCriticalLatency
+ annotations:
+ summary: "Critical Nexus proxy latency for {{`{{ $labels.instance }}`}}."
+ description: "Proxy request to {{`{{ $labels.instance }}`}} has p99 latency above 5 seconds for 3 minutes."
+ dashboard_url: "{{ .Values.prometheusRules.params.dashboard_url }}"
+ runbook_url: "{{ .Values.prometheusRules.params.runbook_url }}"
+ expr: quantile_over_time(0.99, probe_duration_seconds{job="nexus-proxy-targets"}[5m]) > 5
+ for: 3m
+ labels:
+ team: sre
+ service: nexus
+ component: proxy
+ environment: production
+ severity: critical
- - alert: NexusProxySSLCertificateExpired
- expr: probe_ssl_earliest_cert_expiry{job="nexus-blackbox-proxy"} - time() < 0
- for: 1m
- labels:
- severity: critical
- component: proxy
- service: blackbox
- annotations:
- summary: "SSL certificate expired for {{`{{ $labels.instance }}`}}"
- description: "SSL certificate for {{`{{ $labels.instance }}`}} has expired."
-
- - alert: NexusProxyHTTPError
- expr: probe_http_status_code{job="nexus-blackbox-proxy"} >= 400 and probe_http_status_code{job="nexus-blackbox-proxy"} < 600
- for: 1m
- labels:
- severity: warning
- component: proxy
- service: blackbox
- annotations:
- summary: "HTTP error status for {{`{{ $labels.instance }}`}}"
- description: "Proxy request to {{`{{ $labels.instance }}`}} returned HTTP status code {{`{{ $value }}`}}."
-
- - alert: NexusBlackboxExporterDown
- expr: up{job="nexus-blackbox-exporter",namespace="nexus"} == 0
- for: 5m
- labels:
- severity: critical
- component: monitoring
- service: blackbox
- annotations:
- summary: "Blackbox exporter is down"
- description: "Blackbox exporter pod {{`{{ $labels.pod }}`}} in cluster {{`{{ $labels.cluster }}`}} is not responding to scrape requests."
-
- - alert: NexusProxyMultipleTargetsDown
- expr: count(probe_success{job="nexus-blackbox-proxy"} == 0) > 2
- for: 1m
- labels:
- severity: critical
- component: proxy
- service: blackbox
- annotations:
- summary: "Multiple proxy targets are unreachable"
- description: "{{`{{ $value }}`}} targets are currently unreachable through the proxy. This might indicate a proxy outage."
\ No newline at end of file
+ - alert: NexusProxyCertExpiringSoon
+ annotations:
+ summary: "SSL certificate expiring soon for {{`{{ $labels.instance }}`}}."
+ description: "SSL certificate for {{`{{ $labels.instance }}`}} will expire in {{`{{ $value | humanizeDuration }}`}}."
+ dashboard_url: "{{ .Values.prometheusRules.params.dashboard_url }}"
+ runbook_url: "{{ .Values.prometheusRules.params.runbook_url }}"
+ expr: probe_ssl_earliest_cert_expiry{job="nexus-proxy-targets"} - time() < 86400 * 30
+ labels:
+ team: sre
+ service: nexus
+ component: certificates
+ environment: production
+ severity: warning
\ No newline at end of file
diff --git a/deploy/files/alerts/nexus-common.rules b/deploy/files/alerts/nexus-common.rules
index a26e1eb..988ad1b 100644
--- a/deploy/files/alerts/nexus-common.rules
+++ b/deploy/files/alerts/nexus-common.rules
@@ -1,34 +1,195 @@
- name: nexus-common-rules
rules:
- - alert: NexusMemUsageIsCritical
+ - alert: NexusAvailabilityLow
annotations:
- description: "Nexus is using too much memory. Check Nexus"
- summary: "Nexus is using too much memory. Possible system crash."
- expr: sum(container_memory_working_set_bytes{cluster="", namespace="nexus", container="nexus3", image!=""}) by (pod) / 1073741824 >= kube_pod_container_resource_limits{namespace="nexus", resource="memory", container="nexus3"} / 1073741824
+ summary: "Nexus availability is low."
+ description: "Nexus pod readiness is below 95% for 3 minutes."
+ dashboard_url: "{{ .Values.prometheusRules.params.dashboard_url }}"
+ runbook_url: "{{ .Values.prometheusRules.params.runbook_url }}"
+ expr: avg(avg_over_time(kube_pod_container_status_ready{namespace="nexus",container="nexus3"}[5m])) < 0.95
+ for: 3m
labels:
+ team: sre
+ service: nexus
+ component: availability
+ environment: production
severity: critical
- - alert: NexusMemHeapUsageIsCritical
+
+ - alert: NexusIngress4xxHigh
annotations:
- description: "Nexus JAVA MEM is using too much memory. Check Nexus"
- summary: "Nexus JAVA MEM is using too much memory. Less than 512 MB left"
- expr: (sum(jvm_memory_total_max {namespace="nexus"}) by (pod) / 1073741824) - (sum(jvm_memory_total_used{namespace="nexus", container="nexus3"}) by (pod) / 1073741824) <= 0.512
- for: 10m
+ summary: "Nexus 4xx error ratio is high."
+ description: "Nexus 4xx ratio is above 2% for 3 minutes."
+ dashboard_url: "{{ .Values.prometheusRules.params.dashboard_url }}"
+ runbook_url: "{{ .Values.prometheusRules.params.runbook_url }}"
+ expr: |-
+ sum(rate(org_eclipse_jetty_webapp_WebAppContext_4xx_responses_total{namespace="nexus"}[10m]))
+ /
+ (
+ sum(rate(org_eclipse_jetty_webapp_WebAppContext_2xx_responses_total{namespace="nexus"}[10m]))
+ +
+ sum(rate(org_eclipse_jetty_webapp_WebAppContext_4xx_responses_total{namespace="nexus"}[10m]))
+ +
+ sum(rate(org_eclipse_jetty_webapp_WebAppContext_5xx_responses_total{namespace="nexus"}[10m]))
+ ) > 0.02
+ for: 3m
labels:
+ team: sre
+ service: nexus
+ component: ingress
+ environment: production
severity: warning
- - alert: NexusDown
+
+ - alert: NexusIngress4xxCritical
annotations:
- description: Nexus unavailable for 5 minutes.
- summary: Nexus unavailable for 5 minutes. You need to check the functionality.
- expr: avg(avg_over_time(kube_pod_container_status_ready{namespace="nexus",container="nexus3"}[5m])) < 1
- for: 5m
+ summary: "Nexus 4xx error ratio is critical."
+ description: "Nexus 4xx ratio is above 5% for 3 minutes."
+ dashboard_url: "{{ .Values.prometheusRules.params.dashboard_url }}"
+ runbook_url: "{{ .Values.prometheusRules.params.runbook_url }}"
+ expr: |-
+ sum(rate(org_eclipse_jetty_webapp_WebAppContext_4xx_responses_total{namespace="nexus"}[10m]))
+ /
+ (
+ sum(rate(org_eclipse_jetty_webapp_WebAppContext_2xx_responses_total{namespace="nexus"}[10m]))
+ +
+ sum(rate(org_eclipse_jetty_webapp_WebAppContext_4xx_responses_total{namespace="nexus"}[10m]))
+ +
+ sum(rate(org_eclipse_jetty_webapp_WebAppContext_5xx_responses_total{namespace="nexus"}[10m]))
+ ) > 0.05
+ for: 3m
+ labels:
+ team: sre
+ service: nexus
+ component: ingress
+ environment: production
+ severity: critical
+
+ - alert: NexusIngress5xxHigh
+ annotations:
+ summary: "Nexus 5xx error ratio is high."
+ description: "Nexus 5xx ratio is above 2% for 3 minutes."
+ dashboard_url: "{{ .Values.prometheusRules.params.dashboard_url }}"
+ runbook_url: "{{ .Values.prometheusRules.params.runbook_url }}"
+ expr: |-
+ sum(rate(org_eclipse_jetty_webapp_WebAppContext_5xx_responses_total{namespace="nexus"}[10m]))
+ /
+ (
+ sum(rate(org_eclipse_jetty_webapp_WebAppContext_2xx_responses_total{namespace="nexus"}[10m]))
+ +
+ sum(rate(org_eclipse_jetty_webapp_WebAppContext_4xx_responses_total{namespace="nexus"}[10m]))
+ +
+ sum(rate(org_eclipse_jetty_webapp_WebAppContext_5xx_responses_total{namespace="nexus"}[10m]))
+ ) > 0.02
+ for: 3m
+ labels:
+ team: sre
+ service: nexus
+ component: ingress
+ environment: production
+ severity: warning
+
+ - alert: NexusIngress5xxCritical
+ annotations:
+ summary: "Nexus 5xx error ratio is critical."
+ description: "Nexus 5xx ratio is above 5% for 3 minutes."
+ dashboard_url: "{{ .Values.prometheusRules.params.dashboard_url }}"
+ runbook_url: "{{ .Values.prometheusRules.params.runbook_url }}"
+ expr: |-
+ sum(rate(org_eclipse_jetty_webapp_WebAppContext_5xx_responses_total{namespace="nexus"}[10m]))
+ /
+ (
+ sum(rate(org_eclipse_jetty_webapp_WebAppContext_2xx_responses_total{namespace="nexus"}[10m]))
+ +
+ sum(rate(org_eclipse_jetty_webapp_WebAppContext_4xx_responses_total{namespace="nexus"}[10m]))
+ +
+ sum(rate(org_eclipse_jetty_webapp_WebAppContext_5xx_responses_total{namespace="nexus"}[10m]))
+ ) > 0.05
+ for: 3m
labels:
+ team: sre
+ service: nexus
+ component: ingress
+ environment: production
severity: critical
- - alert: NexusErrorMessages
+
+ - alert: NexusRegistryLatencyHigh
annotations:
- description: Nexus has error messages for the last 5m.
- logs_url: {{ .Values.prometheusRules.params.logs_url }}
- summary: Nexus has error messages for the last 5m. You need to check the logs.
- expr: rate(metrics_error_total{namespace="nexus",container="nexus3"}[5m]) > 10
+ summary: "Nexus repository read latency is high."
+ description: "Nexus repository read p99 latency is above 3 seconds for 5 minutes."
+ dashboard_url: "{{ .Values.prometheusRules.params.dashboard_url }}"
+ runbook_url: "{{ .Values.prometheusRules.params.runbook_url }}"
+ expr: org_sonatype_nexus_coreui_RepositoryComponent_read_timer{quantile="0.99",namespace="nexus"} > 3
for: 5m
labels:
+ team: sre
+ service: nexus
+ component: registry
+ environment: production
severity: warning
+
+ - alert: NexusRegistryLatencyCritical
+ annotations:
+ summary: "Nexus repository read latency is critical."
+ description: "Nexus repository read p99 latency is above 5 seconds for 3 minutes."
+ dashboard_url: "{{ .Values.prometheusRules.params.dashboard_url }}"
+ runbook_url: "{{ .Values.prometheusRules.params.runbook_url }}"
+ expr: org_sonatype_nexus_coreui_RepositoryComponent_read_timer{quantile="0.99",namespace="nexus"} > 5
+ for: 3m
+ labels:
+ team: sre
+ service: nexus
+ component: registry
+ environment: production
+ severity: critical
+
+ - alert: NexusBlobstoreUnavailable
+ annotations:
+ summary: "Nexus blobstore is unavailable."
+ description: "Nexus default blobstore has no available space."
+ dashboard_url: "{{ .Values.prometheusRules.params.dashboard_url }}"
+ runbook_url: "{{ .Values.prometheusRules.params.runbook_url }}"
+ expr: nexus_blobstores_stats_availableSpaceInBytes{namespace="nexus", name="default"} == 0
+ for: 1m
+ labels:
+ team: sre
+ service: nexus
+ component: blobstore
+ environment: production
+ severity: critical
+
+ - alert: NexusJvmHeapHigh
+ annotations:
+ summary: "Nexus JVM heap usage is high."
+ description: "Nexus JVM heap usage is above 90% for 3 minutes."
+ dashboard_url: "{{ .Values.prometheusRules.params.dashboard_url }}"
+ runbook_url: "{{ .Values.prometheusRules.params.runbook_url }}"
+ expr: |-
+ sum(jvm_memory_heap_used{namespace="nexus", container="nexus3"}) by(pod)
+ /
+ sum(jvm_memory_heap_max{namespace="nexus", container="nexus3"}) by(pod)
+ > 0.9
+ for: 3m
+ labels:
+ team: sre
+ service: nexus
+ component: jvm
+ environment: production
+ severity: critical
+
+ - alert: NexusCpuThrottlingHigh
+ annotations:
+ summary: "Nexus CPU throttling is high."
+ description: "Nexus CPU throttling ratio is above 20% for 5 minutes."
+ dashboard_url: "{{ .Values.prometheusRules.params.dashboard_url }}"
+ runbook_url: "{{ .Values.prometheusRules.params.runbook_url }}"
+ expr: |-
+ rate(container_cpu_cfs_throttled_periods_total{namespace="nexus",container="nexus3"}[5m])
+ /
+ rate(container_cpu_cfs_periods_total{namespace="nexus",container="nexus3"}[5m])
+ > 0.2
+ for: 5m
+ labels:
+ team: sre
+ service: nexus
+ component: cpu
+ environment: production
+ severity: warning