Загрузка данных


dmitriev-aal@VDI-Dmitriev-A:~/Desktop/appfarm/infra/k8s/nexus$ git diff origin/master -- \
> deploy/files/alerts/nexus-common.rules \
> deploy/files/alerts/nexus-alerts.rules \
> deploy/files/alerts/nexus-blackbox-proxy.rules


diff --git a/deploy/files/alerts/nexus-alerts.rules b/deploy/files/alerts/nexus-alerts.rules
index a03eabb..96cf831 100644
--- a/deploy/files/alerts/nexus-alerts.rules
+++ b/deploy/files/alerts/nexus-alerts.rules
@@ -2,9 +2,10 @@
   rules:
   - alert: CriticalNexusOrientDBFillingUP
     annotations:
-      description: The PersistentVolume claimed by {{`{{ $labels.persistentvolumeclaim }}`}} in Namespace {{`{{ $labels.namespace }}`}} is only {{`{{ $value | humanizePercentage }}`}} free.
-      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
-      summary: PersistentVolume is filling up.
+      description: "The PersistentVolume claimed by {{`{{ $labels.persistentvolumeclaim }}`}} in Namespace {{`{{ $labels.namespace }}`}} is only {{`{{ $value | humanizePercentage }}`}} free."
+      runbook_url: "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup"
+      summary: "PersistentVolume is filling up."
+      dashboard_url: "{{ .Values.prometheusRules.params.dashboard_url }}"
     expr: |-
       kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics", persistentvolumeclaim="nexus-nexus3-data"}
         /
@@ -12,31 +13,54 @@
         < 0.03
     for: 1m
     labels:
+      team: sre
+      service: nexus
+      component: storage
+      environment: production
       severity: critical
+
   - alert: WarningNexusOrientDBFillingUP
     annotations:
-      description: Based on recent sampling, the PersistentVolume claimed by {{`{{ $labels.persistentvolumeclaim }}`}} in Namespace {{`{{ $labels.namespace }}`}} is expected to fill up within four days. Currently {{`{{ $value | humanizePercentage }}`}} is available.
-      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
-      summary: PersistentVolume is filling up.
+      description: "Based on recent sampling, the PersistentVolume claimed by {{`{{ $labels.persistentvolumeclaim }}`}} in Namespace {{`{{ $labels.namespace }}`}} is expected to fill up within four days. Currently {{`{{ $value | humanizePercentage }}`}} is available."
+      runbook_url: "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup"
+      summary: "PersistentVolume is filling up."
+      dashboard_url: "{{ .Values.prometheusRules.params.dashboard_url }}"
     expr: |-
-        kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics", persistentvolumeclaim="nexus-nexus3-data"} /1024/1024/1024 < 6
+      kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics", persistentvolumeclaim="nexus-nexus3-data"} /1024/1024/1024 < 6
     for: 1h
     labels:
+      team: sre
+      service: nexus
+      component: storage
+      environment: production
       severity: warning
+
   - alert: CriticalNexusOrientDBFillingErrors
     annotations:
-      description: The persistent volume {{`{{ $labels.persistentvolume }}`}} has status {{`{{ $labels.phase }}`}}.
-      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors
-      summary: PersistentVolume is having issues with provisioning.
+      description: "The persistent volume {{`{{ $labels.persistentvolume }}`}} has status {{`{{ $labels.phase }}`}}."
+      runbook_url: "https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors"
+      summary: "PersistentVolume is having issues with provisioning."
+      dashboard_url: "{{ .Values.prometheusRules.params.dashboard_url }}"
     expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics", persistentvolumeclaim="nexus-nexus3-data"} > 0
     for: 5m
     labels:
+      team: sre
+      service: nexus
+      component: storage
+      environment: production
       severity: critical
-  - alert: NexusTaskNotOK
+
+  - alert: NexusTaskFailures
     annotations:
-      description: Task with name {{`{{ $labels.name }}`}} was completed with is NOT OK status.
-      summary: Check Nexus UI (System -> Tasks) and logs for investigating.
-    expr: nexus_tasks_status{lastRunResult!~"OK|<nil>"} > 0
+      description: "Task with name {{`{{ $labels.name }}`}} was completed with NOT OK status {{`{{ $labels.lastRunResult }}`}}."
+      summary: "Check Nexus UI (System -> Tasks) and logs for investigating."
+      dashboard_url: "{{ .Values.prometheusRules.params.dashboard_url }}"
+      runbook_url: "{{ .Values.prometheusRules.params.runbook_url }}"
+    expr: nexus_tasks_status{lastRunResult!~"OK|<nil>"}
     for: 1m
     labels:
-      severity: critical
+      team: sre
+      service: nexus
+      component: tasks
+      environment: production
+      severity: critical
\ No newline at end of file
diff --git a/deploy/files/alerts/nexus-blackbox-proxy.rules b/deploy/files/alerts/nexus-blackbox-proxy.rules
index 80f5744..c0ab151 100644
--- a/deploy/files/alerts/nexus-blackbox-proxy.rules
+++ b/deploy/files/alerts/nexus-blackbox-proxy.rules
@@ -1,78 +1,60 @@
 - name: nexus-blackbox-proxy
   rules:
-    - alert: NexusProxyDown
-      expr: probe_success{job="nexus-blackbox-proxy"} == 0
-      for: 1m
-      labels:
-        severity: critical
-        component: proxy
-        service: blackbox
-      annotations:
-        summary: "Proxy connection failed for {{`{{ $labels.instance }}`}}"
-        description: "Blackbox exporter failed to connect to {{`{{ $labels.instance }}`}} through proxy. The proxy might be down or the target is unreachable."
+  - alert: NexusProxyTargetDown
+    annotations:
+      summary: "Proxy target {{`{{ $labels.instance }}`}} is down."
+      description: "Blackbox probe through Nexus proxy failed for {{`{{ $labels.instance }}`}} for 3 minutes."
+      dashboard_url: "{{ .Values.prometheusRules.params.dashboard_url }}"
+      runbook_url: "{{ .Values.prometheusRules.params.runbook_url }}"
+    expr: probe_success{job="nexus-proxy-targets"} == 0
+    for: 3m
+    labels:
+      team: sre
+      service: nexus
+      component: proxy
+      environment: production
+      severity: critical
 
-    - alert: NexusProxyHighLatency
-      expr: probe_duration_seconds{job="nexus-blackbox-proxy"} > 5
-      for: 5m
-      labels:
-        severity: warning
-        component: proxy
-        service: blackbox
-      annotations:
-        summary: "High latency detected for {{`{{ $labels.instance }}`}}"
-        description: "Proxy request to {{`{{ $labels.instance }}`}} took {{`{{ $value }}`}}s (threshold: 5s). This might indicate network issues or proxy performance problems."
+  - alert: NexusProxyHighLatency
+    annotations:
+      summary: "High Nexus proxy latency for {{`{{ $labels.instance }}`}}."
+      description: "Proxy request to {{`{{ $labels.instance }}`}} has p95 latency above 3 seconds for 5 minutes."
+      dashboard_url: "{{ .Values.prometheusRules.params.dashboard_url }}"
+      runbook_url: "{{ .Values.prometheusRules.params.runbook_url }}"
+    expr: quantile_over_time(0.95, probe_duration_seconds{job="nexus-proxy-targets"}[5m]) > 3
+    for: 5m
+    labels:
+      team: sre
+      service: nexus
+      component: proxy
+      environment: production
+      severity: warning
 
-    - alert: NexusProxySSLCertificateExpiringSoon
-      expr: probe_ssl_earliest_cert_expiry{job="nexus-blackbox-proxy"} - time() < 86400 * 7
-      for: 1h
-      labels:
-        severity: warning
-        component: proxy
-        service: blackbox
-      annotations:
-        summary: "SSL certificate expiring soon for {{`{{ $labels.instance }}`}}"
-        description: "SSL certificate for {{`{{ $labels.instance }}`}} will expire in {{`{{ $value | humanizeDuration }}`}}."
+  - alert: NexusProxyCriticalLatency
+    annotations:
+      summary: "Critical Nexus proxy latency for {{`{{ $labels.instance }}`}}."
+      description: "Proxy request to {{`{{ $labels.instance }}`}} has p99 latency above 5 seconds for 3 minutes."
+      dashboard_url: "{{ .Values.prometheusRules.params.dashboard_url }}"
+      runbook_url: "{{ .Values.prometheusRules.params.runbook_url }}"
+    expr: quantile_over_time(0.99, probe_duration_seconds{job="nexus-proxy-targets"}[5m]) > 5
+    for: 3m
+    labels:
+      team: sre
+      service: nexus
+      component: proxy
+      environment: production
+      severity: critical
 
-    - alert: NexusProxySSLCertificateExpired
-      expr: probe_ssl_earliest_cert_expiry{job="nexus-blackbox-proxy"} - time() < 0
-      for: 1m
-      labels:
-        severity: critical
-        component: proxy
-        service: blackbox
-      annotations:
-        summary: "SSL certificate expired for {{`{{ $labels.instance }}`}}"
-        description: "SSL certificate for {{`{{ $labels.instance }}`}} has expired."
-
-    - alert: NexusProxyHTTPError
-      expr: probe_http_status_code{job="nexus-blackbox-proxy"} >= 400 and probe_http_status_code{job="nexus-blackbox-proxy"} < 600
-      for: 1m
-      labels:
-        severity: warning
-        component: proxy
-        service: blackbox
-      annotations:
-        summary: "HTTP error status for {{`{{ $labels.instance }}`}}"
-        description: "Proxy request to {{`{{ $labels.instance }}`}} returned HTTP status code {{`{{ $value }}`}}."
-
-    - alert: NexusBlackboxExporterDown
-      expr: up{job="nexus-blackbox-exporter",namespace="nexus"} == 0
-      for: 5m
-      labels:
-        severity: critical
-        component: monitoring
-        service: blackbox
-      annotations:
-        summary: "Blackbox exporter is down"
-        description: "Blackbox exporter pod {{`{{ $labels.pod }}`}} in cluster {{`{{ $labels.cluster }}`}} is not responding to scrape requests."
-
-    - alert: NexusProxyMultipleTargetsDown
-      expr: count(probe_success{job="nexus-blackbox-proxy"} == 0) > 2
-      for: 1m
-      labels:
-        severity: critical
-        component: proxy
-        service: blackbox
-      annotations:
-        summary: "Multiple proxy targets are unreachable"
-        description: "{{`{{ $value }}`}} targets are currently unreachable through the proxy. This might indicate a proxy outage."
\ No newline at end of file
+  - alert: NexusProxyCertExpiringSoon
+    annotations:
+      summary: "SSL certificate expiring soon for {{`{{ $labels.instance }}`}}."
+      description: "SSL certificate for {{`{{ $labels.instance }}`}} will expire in {{`{{ $value | humanizeDuration }}`}}."
+      dashboard_url: "{{ .Values.prometheusRules.params.dashboard_url }}"
+      runbook_url: "{{ .Values.prometheusRules.params.runbook_url }}"
+    expr: probe_ssl_earliest_cert_expiry{job="nexus-proxy-targets"} - time() < 86400 * 30
+    labels:
+      team: sre
+      service: nexus
+      component: certificates
+      environment: production
+      severity: warning
\ No newline at end of file
diff --git a/deploy/files/alerts/nexus-common.rules b/deploy/files/alerts/nexus-common.rules
index a26e1eb..988ad1b 100644
--- a/deploy/files/alerts/nexus-common.rules
+++ b/deploy/files/alerts/nexus-common.rules
@@ -1,34 +1,195 @@
 - name: nexus-common-rules
   rules:
-  - alert: NexusMemUsageIsCritical
+  - alert: NexusAvailabilityLow
     annotations:
-      description: "Nexus is using too much memory. Check Nexus"
-      summary: "Nexus is using too much memory. Possible system crash."
-    expr: sum(container_memory_working_set_bytes{cluster="", namespace="nexus", container="nexus3", image!=""}) by (pod) / 1073741824  >= kube_pod_container_resource_limits{namespace="nexus", resource="memory", container="nexus3"} / 1073741824
+      summary: "Nexus availability is low."
+      description: "Nexus pod readiness is below 95% for 3 minutes."
+      dashboard_url: "{{ .Values.prometheusRules.params.dashboard_url }}"
+      runbook_url: "{{ .Values.prometheusRules.params.runbook_url }}"
+    expr: avg(avg_over_time(kube_pod_container_status_ready{namespace="nexus",container="nexus3"}[5m])) < 0.95
+    for: 3m
     labels:
+      team: sre
+      service: nexus
+      component: availability
+      environment: production
       severity: critical
-  - alert: NexusMemHeapUsageIsCritical
+
+  - alert: NexusIngress4xxHigh
     annotations:
-      description: "Nexus JAVA MEM is using too much memory. Check Nexus"
-      summary: "Nexus JAVA MEM is using too much memory. Less than 512 MB left"
-    expr: (sum(jvm_memory_total_max {namespace="nexus"}) by (pod) / 1073741824) - (sum(jvm_memory_total_used{namespace="nexus", container="nexus3"}) by (pod) / 1073741824) <= 0.512
-    for: 10m
+      summary: "Nexus 4xx error ratio is high."
+      description: "Nexus 4xx ratio is above 2% for 3 minutes."
+      dashboard_url: "{{ .Values.prometheusRules.params.dashboard_url }}"
+      runbook_url: "{{ .Values.prometheusRules.params.runbook_url }}"
+    expr: |-
+      sum(rate(org_eclipse_jetty_webapp_WebAppContext_4xx_responses_total{namespace="nexus"}[10m]))
+        /
+      (
+        sum(rate(org_eclipse_jetty_webapp_WebAppContext_2xx_responses_total{namespace="nexus"}[10m]))
+        +
+        sum(rate(org_eclipse_jetty_webapp_WebAppContext_4xx_responses_total{namespace="nexus"}[10m]))
+        +
+        sum(rate(org_eclipse_jetty_webapp_WebAppContext_5xx_responses_total{namespace="nexus"}[10m]))
+      ) > 0.02
+    for: 3m
     labels:
+      team: sre
+      service: nexus
+      component: ingress
+      environment: production
       severity: warning
-  - alert: NexusDown
+
+  - alert: NexusIngress4xxCritical
     annotations:
-      description: Nexus unavailable for 5 minutes.
-      summary: Nexus unavailable for 5 minutes. You need to check the functionality.
-    expr: avg(avg_over_time(kube_pod_container_status_ready{namespace="nexus",container="nexus3"}[5m])) < 1
-    for: 5m
+      summary: "Nexus 4xx error ratio is critical."
+      description: "Nexus 4xx ratio is above 5% for 3 minutes."
+      dashboard_url: "{{ .Values.prometheusRules.params.dashboard_url }}"
+      runbook_url: "{{ .Values.prometheusRules.params.runbook_url }}"
+    expr: |-
+      sum(rate(org_eclipse_jetty_webapp_WebAppContext_4xx_responses_total{namespace="nexus"}[10m]))
+        /
+      (
+        sum(rate(org_eclipse_jetty_webapp_WebAppContext_2xx_responses_total{namespace="nexus"}[10m]))
+        +
+        sum(rate(org_eclipse_jetty_webapp_WebAppContext_4xx_responses_total{namespace="nexus"}[10m]))
+        +
+        sum(rate(org_eclipse_jetty_webapp_WebAppContext_5xx_responses_total{namespace="nexus"}[10m]))
+      ) > 0.05
+    for: 3m
+    labels:
+      team: sre
+      service: nexus
+      component: ingress
+      environment: production
+      severity: critical
+
+  - alert: NexusIngress5xxHigh
+    annotations:
+      summary: "Nexus 5xx error ratio is high."
+      description: "Nexus 5xx ratio is above 2% for 3 minutes."
+      dashboard_url: "{{ .Values.prometheusRules.params.dashboard_url }}"
+      runbook_url: "{{ .Values.prometheusRules.params.runbook_url }}"
+    expr: |-
+      sum(rate(org_eclipse_jetty_webapp_WebAppContext_5xx_responses_total{namespace="nexus"}[10m]))
+        /
+      (
+        sum(rate(org_eclipse_jetty_webapp_WebAppContext_2xx_responses_total{namespace="nexus"}[10m]))
+        +
+        sum(rate(org_eclipse_jetty_webapp_WebAppContext_4xx_responses_total{namespace="nexus"}[10m]))
+        +
+        sum(rate(org_eclipse_jetty_webapp_WebAppContext_5xx_responses_total{namespace="nexus"}[10m]))
+      ) > 0.02
+    for: 3m
+    labels:
+      team: sre
+      service: nexus
+      component: ingress
+      environment: production
+      severity: warning
+
+  - alert: NexusIngress5xxCritical
+    annotations:
+      summary: "Nexus 5xx error ratio is critical."
+      description: "Nexus 5xx ratio is above 5% for 3 minutes."
+      dashboard_url: "{{ .Values.prometheusRules.params.dashboard_url }}"
+      runbook_url: "{{ .Values.prometheusRules.params.runbook_url }}"
+    expr: |-
+      sum(rate(org_eclipse_jetty_webapp_WebAppContext_5xx_responses_total{namespace="nexus"}[10m]))
+        /
+      (
+        sum(rate(org_eclipse_jetty_webapp_WebAppContext_2xx_responses_total{namespace="nexus"}[10m]))
+        +
+        sum(rate(org_eclipse_jetty_webapp_WebAppContext_4xx_responses_total{namespace="nexus"}[10m]))
+        +
+        sum(rate(org_eclipse_jetty_webapp_WebAppContext_5xx_responses_total{namespace="nexus"}[10m]))
+      ) > 0.05
+    for: 3m
     labels:
+      team: sre
+      service: nexus
+      component: ingress
+      environment: production
       severity: critical
-  - alert: NexusErrorMessages
+
+  - alert: NexusRegistryLatencyHigh
     annotations:
-      description: Nexus has error messages for the last 5m.
-      logs_url: {{ .Values.prometheusRules.params.logs_url }}
-      summary: Nexus has error messages for the last 5m. You need to check the logs.
-    expr: rate(metrics_error_total{namespace="nexus",container="nexus3"}[5m]) > 10
+      summary: "Nexus repository read latency is high."
+      description: "Nexus repository read p99 latency is above 3 seconds for 5 minutes."
+      dashboard_url: "{{ .Values.prometheusRules.params.dashboard_url }}"
+      runbook_url: "{{ .Values.prometheusRules.params.runbook_url }}"
+    expr: org_sonatype_nexus_coreui_RepositoryComponent_read_timer{quantile="0.99",namespace="nexus"} > 3
     for: 5m
     labels:
+      team: sre
+      service: nexus
+      component: registry
+      environment: production
       severity: warning
+
+  - alert: NexusRegistryLatencyCritical
+    annotations:
+      summary: "Nexus repository read latency is critical."
+      description: "Nexus repository read p99 latency is above 5 seconds for 3 minutes."
+      dashboard_url: "{{ .Values.prometheusRules.params.dashboard_url }}"
+      runbook_url: "{{ .Values.prometheusRules.params.runbook_url }}"
+    expr: org_sonatype_nexus_coreui_RepositoryComponent_read_timer{quantile="0.99",namespace="nexus"} > 5
+    for: 3m
+    labels:
+      team: sre
+      service: nexus
+      component: registry
+      environment: production
+      severity: critical
+
+  - alert: NexusBlobstoreUnavailable
+    annotations:
+      summary: "Nexus blobstore is unavailable."
+      description: "Nexus default blobstore has no available space."
+      dashboard_url: "{{ .Values.prometheusRules.params.dashboard_url }}"
+      runbook_url: "{{ .Values.prometheusRules.params.runbook_url }}"
+    expr: nexus_blobstores_stats_availableSpaceInBytes{namespace="nexus", name="default"} == 0
+    for: 1m
+    labels:
+      team: sre
+      service: nexus
+      component: blobstore
+      environment: production
+      severity: critical
+
+  - alert: NexusJvmHeapHigh
+    annotations:
+      summary: "Nexus JVM heap usage is high."
+      description: "Nexus JVM heap usage is above 90% for 3 minutes."
+      dashboard_url: "{{ .Values.prometheusRules.params.dashboard_url }}"
+      runbook_url: "{{ .Values.prometheusRules.params.runbook_url }}"
+    expr: |-
+      sum(jvm_memory_heap_used{namespace="nexus", container="nexus3"}) by(pod)
+        /
+      sum(jvm_memory_heap_max{namespace="nexus", container="nexus3"}) by(pod)
+        > 0.9
+    for: 3m
+    labels:
+      team: sre
+      service: nexus
+      component: jvm
+      environment: production
+      severity: critical
+
+  - alert: NexusCpuThrottlingHigh
+    annotations:
+      summary: "Nexus CPU throttling is high."
+      description: "Nexus CPU throttling ratio is above 20% for 5 minutes."
+      dashboard_url: "{{ .Values.prometheusRules.params.dashboard_url }}"
+      runbook_url: "{{ .Values.prometheusRules.params.runbook_url }}"
+    expr: |-
+      rate(container_cpu_cfs_throttled_periods_total{namespace="nexus",container="nexus3"}[5m])
+        /
+      rate(container_cpu_cfs_periods_total{namespace="nexus",container="nexus3"}[5m])
+        > 0.2
+    for: 5m
+    labels:
+      team: sre
+      service: nexus
+      component: cpu
+      environment: production
+      severity: warning