From 938a199cea46f967baf2ce6a946659075417b255 Mon Sep 17 00:00:00 2001 From: Gerard Vanloo Date: Tue, 12 Aug 2025 19:31:42 -0400 Subject: [PATCH 01/35] chore: lint tools role Signed-off-by: Gerard Vanloo --- sre/roles/tools/tasks/install.yaml | 4 ++-- sre/roles/tools/tasks/reinit.yaml | 6 +++--- sre/roles/tools/tasks/uninstall_opentelemetry.yaml | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/sre/roles/tools/tasks/install.yaml b/sre/roles/tools/tasks/install.yaml index a5e5c46d..c50aca57 100644 --- a/sre/roles/tools/tasks/install.yaml +++ b/sre/roles/tools/tasks/install.yaml @@ -16,8 +16,8 @@ cmd: kubectl certificate approve {{ item.metadata.name }} environment: KUBECONFIG: "{{ tools_cluster.kubeconfig | ansible.builtin.expanduser }}" - register: cert_approve_output - changed_when: cert_approve_output.rc == 0 + register: tools_certs_approve_output + changed_when: tools_certs_approve_output.rc == 0 loop: "{{ tools_csr_info.resources }}" loop_control: label: "csr/{{ item.metadata.name }}" diff --git a/sre/roles/tools/tasks/reinit.yaml b/sre/roles/tools/tasks/reinit.yaml index 2505cb4a..d2abc91a 100644 --- a/sre/roles/tools/tasks/reinit.yaml +++ b/sre/roles/tools/tasks/reinit.yaml @@ -65,8 +65,8 @@ cmd: kubectl delete events --all-namespaces --all --ignore-not-found=true environment: KUBECONFIG: "{{ tools_cluster.kubeconfig }}" - register: delete_events_result - changed_when: delete_events_result.rc == 0 - failed_when: delete_events_result.rc not in [0, 1] + register: tools_delete_events_result + changed_when: tools_delete_events_result.rc == 0 + failed_when: tools_delete_events_result.rc not in [0, 1] when: - tools_cluster.platform == "kubernetes" diff --git a/sre/roles/tools/tasks/uninstall_opentelemetry.yaml b/sre/roles/tools/tasks/uninstall_opentelemetry.yaml index 94610a5a..6317a687 100644 --- a/sre/roles/tools/tasks/uninstall_opentelemetry.yaml +++ b/sre/roles/tools/tasks/uninstall_opentelemetry.yaml @@ -15,7 +15,7 @@ kind: OpenTelemetryCollector kubeconfig: "{{ tools_cluster.kubeconfig }}" namespace: "{{ helm_releases.collectors.namespace }}" - register: opentelemetry_collector_info + register: tools_opentelemetry_collector_info when: - tools_required.jaeger @@ -30,7 +30,7 @@ wait: true when: - tools_required.jaeger - - opentelemetry_collector_info.resources | length > 0 + - tools_opentelemetry_collector_info.resources | length > 0 - name: Uninstall OpenTelemetry Operator kubernetes.core.helm: From 00dec1d235b98962ea9234858e147c1f7121df46 Mon Sep 17 00:00:00 2001 From: Gerard Vanloo Date: Wed, 13 Aug 2025 11:29:53 -0400 Subject: [PATCH 02/35] feat: replace AWX alert cronjob with recorders roles Signed-off-by: Gerard Vanloo --- .github/dependabot.yml | 2 + sre/Makefile | 4 +- sre/cron_jobs/alert_recorder.yaml | 59 ------------- sre/cron_jobs/alert_recorder_deinit.yaml | 44 ---------- sre/cron_jobs/alert_recorder_init.yaml | 37 -------- sre/playbooks/manage_incidents.yaml | 32 +++---- sre/roles/awx/tasks/configure_awx.yaml | 17 ---- sre/roles/awx/tasks/configure_jobs.yaml | 54 ------------ sre/roles/awx/tasks/configure_workflows.yaml | 29 ------ .../recorders/defaults/main/namespace.yaml | 3 + .../files/kubernetes/alerts/prometheus.yaml | 71 +++++++++++++++ .../files/scripts/alerts/prometheus/gather.py | 64 ++++++++++++++ .../alerts/prometheus/requirements.txt | 1 + sre/roles/recorders/meta/argument_specs.yaml | 53 +++++++++++ sre/roles/recorders/tasks/install.yaml | 16 ++++ .../tasks/install_alerts_recorders.yaml | 6 ++ .../install_alerts_recorders_prometheus.yaml 
| 88 +++++++++++++++++++ sre/roles/recorders/tasks/main.yaml | 12 +++ sre/roles/recorders/tasks/uninstall.yaml | 17 ++++ .../tasks/uninstall_alerts_recorders.yaml | 6 ++ ...uninstall_alerts_recorders_prometheus.yaml | 70 +++++++++++++++ sre/roles/tools/meta/argument_specs.yaml | 23 +++++ sre/roles/tools/tasks/install_opencost.yaml | 33 ++----- .../tools/tasks/set_prometheus_endpoint.yaml | 53 +++++++++++ 24 files changed, 504 insertions(+), 290 deletions(-) delete mode 100644 sre/cron_jobs/alert_recorder.yaml delete mode 100644 sre/cron_jobs/alert_recorder_deinit.yaml delete mode 100644 sre/cron_jobs/alert_recorder_init.yaml create mode 100644 sre/roles/recorders/defaults/main/namespace.yaml create mode 100644 sre/roles/recorders/files/kubernetes/alerts/prometheus.yaml create mode 100644 sre/roles/recorders/files/scripts/alerts/prometheus/gather.py create mode 100644 sre/roles/recorders/files/scripts/alerts/prometheus/requirements.txt create mode 100644 sre/roles/recorders/meta/argument_specs.yaml create mode 100644 sre/roles/recorders/tasks/install.yaml create mode 100644 sre/roles/recorders/tasks/install_alerts_recorders.yaml create mode 100644 sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml create mode 100644 sre/roles/recorders/tasks/main.yaml create mode 100644 sre/roles/recorders/tasks/uninstall.yaml create mode 100644 sre/roles/recorders/tasks/uninstall_alerts_recorders.yaml create mode 100644 sre/roles/recorders/tasks/uninstall_alerts_recorders_prometheus.yaml create mode 100644 sre/roles/tools/tasks/set_prometheus_endpoint.yaml diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 19dbe3a8..4f12279e 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -20,6 +20,7 @@ updates: - package-ecosystem: "docker" directories: + - "sre/roles/recorders/files/kubernetes/" - "sre/roles/tools/files/kubernetes/" - "sre/tools/kubernetes-topology-monitor/" - "sre/tools/kubernetes-topology-monitor/charts/kubernetes-topology-monitor/templates/" @@ -44,6 +45,7 @@ updates: - "/" - "sre/" - "sre/dev/remote_cluster/" + - "sre/roles/recorders/files/scripts/**/" - "sre/tools/kubernetes-topology-monitor/" groups: pip-production-dependencies: diff --git a/sre/Makefile b/sre/Makefile index 82d0a071..2de7e7fb 100644 --- a/sre/Makefile +++ b/sre/Makefile @@ -65,7 +65,7 @@ endif .PHONY: inject_incident_fault inject_incident_fault: ## Injects the fault used in a specific incident ifdef INCIDENT_NUMBER - ansible-playbook -i inventory.yaml playbooks/manage_incidents.yaml --tags "inject_faults" \ + ansible-playbook -i inventory.yaml playbooks/manage_incidents.yaml --tags "inject_faults,install_recorders" \ --extra-vars "incident_id=$(INCIDENT_NUMBER)" else @echo "Missing INCIDENT_NUMBER argument. Please run this command with this variable." @@ -74,7 +74,7 @@ endif .PHONY: remove_incident_fault remove_incident_fault: ## Removes the fault used in a specific incident ifdef INCIDENT_NUMBER - ansible-playbook -i inventory.yaml playbooks/manage_incidents.yaml --tags "remove_faults" \ + ansible-playbook -i inventory.yaml playbooks/manage_incidents.yaml --tags "remove_faults,uninstall_recorders" \ --extra-vars "incident_id=$(INCIDENT_NUMBER)" else @echo "Missing INCIDENT_NUMBER argument. Please run this command with this variable." 
diff --git a/sre/cron_jobs/alert_recorder.yaml b/sre/cron_jobs/alert_recorder.yaml deleted file mode 100644 index 5e16ccc2..00000000 --- a/sre/cron_jobs/alert_recorder.yaml +++ /dev/null @@ -1,59 +0,0 @@ ---- -- name: Periodic run for alert recordings - hosts: localhost - tasks: - - name: (Hack) Kubeconfig path inside AWX-EE container - ansible.builtin.shell: find /runner/env -type f -size +3072c - register: kubeconfig - - - name: Tasks associated with leveraging ingress - ansible.builtin.include_tasks: - file: leverage_ingress.yaml - - - name: Tasks associated with leveraging port forwarding - ansible.builtin.include_tasks: - file: leverage_port_forwarding.yaml - - - name: Initialize an empty list for alerts - set_fact: - filtered_alerts: [] - - - name: Call the alerts API - ansible.builtin.uri: - url: "{{ prometheus_url }}/api/v1/alerts" - method: GET - return_content: yes - body_format: json - headers: - Content-Type: "application/json" - register: alerts_api_result - retries: 10 - delay: 5 - until: alerts_api_result.status == 200 - - - name: Parse JSON response and filter for alerts in state alerting - set_fact: - json_data: "{{ alerts_api_result.json.data.alerts | list }}" - - - name: Create temporary file - ansible.builtin.tempfile: - state: file - suffix: temp - register: tempfile_for_alerts_in_alerting_state - - - name: Debug json_data - debug: - var: json_data - - - name: Copy alerts in JSON to temp file - ansible.builtin.copy: - content: "{{ json_data | to_json }}" - dest: "{{ tempfile_for_alerts_in_alerting_state.path }}" - - - name: Upload Alerts JSON to S3 - amazon.aws.s3_object: - endpoint_url: "{{ s3_endpoint_url }}" - bucket: "{{ s3_bucket_name_for_results }}" - object: "/{{ sre_agent_name__version_number }}/{{run_uuid}}/{{scenario_number}}/{{run_number}}/alerts/alerts_at_{{now(utc=true,fmt='%Y-%m-%dT%H:%M:%S.%f')}}.txt" - src: "{{ tempfile_for_alerts_in_alerting_state.path }}" - mode: put diff --git a/sre/cron_jobs/alert_recorder_deinit.yaml b/sre/cron_jobs/alert_recorder_deinit.yaml deleted file mode 100644 index db0c5b4e..00000000 --- a/sre/cron_jobs/alert_recorder_deinit.yaml +++ /dev/null @@ -1,44 +0,0 @@ ---- -- name: (Playbook) Remove alert recorder every minute - hosts: localhost - tasks: - - name: (Task) Remove alert recorder schedule which runs every minute - awx.awx.schedule: - controller_host: "{{ controller_host }}" - name: "Schedule for Alert Recorder - {{ scenario_number }}" - state: absent - unified_job_template: "Scenario-{{ scenario_number }}--Setup-Alert-Recorder" - - - name: Get running jobs for the template - awx.awx.job_list: - controller_host: "{{ controller_host }}" - query: - job_template__name: "Scenario-{{scenario_number}}--Setup-Alert-Recorder" - status: running - register: running_alert_recorder_jobs - - - name: Cancel running jobs - awx.awx.job_cancel: - controller_host: "{{ controller_host }}" - job_id: "{{ item.id }}" - loop: "{{ running_alert_recorder_jobs.results }}" - when: running_alert_recorder_jobs.results | length > 0 - - - name: Wait briefly for jobs to cancel - pause: - seconds: 60 - when: running_alert_recorder_jobs.results | length > 0 - - - name: Delete alert recorder job template - awx.awx.job_template: - controller_host: "{{ controller_host }}" - name: "Scenario-{{scenario_number}}--Setup-Alert-Recorder" - job_type: "run" - organization: "Default" - inventory: "Demo Inventory" - state: absent - register: delete_attempt_jt_alert_recorder_job - until: delete_attempt_jt_alert_recorder_job is succeeded - retries: 3 - delay: 30 - 
ignore_errors: yes diff --git a/sre/cron_jobs/alert_recorder_init.yaml b/sre/cron_jobs/alert_recorder_init.yaml deleted file mode 100644 index 959f5b66..00000000 --- a/sre/cron_jobs/alert_recorder_init.yaml +++ /dev/null @@ -1,37 +0,0 @@ ---- -- name: (Playbook) Initialize alert recorder every minute - hosts: localhost - tasks: - - name: Create Alert recorder - awx.awx.job_template: - controller_host: "{{ controller_host }}" - name: "Scenario-{{ scenario_number }}--Setup-Alert-Recorder" - job_type: "run" - project: "Project-GitHub-IT-Automation-Bench" - playbook: "sre/cron_jobs/alert_recorder.yaml" - inventory: "Demo Inventory" - organization: "Default" - execution_environment: "AWX-EE-Custom" - credentials: - - "Credential-AWS" - - "Credential-AWX" - - "Credential-Kubeconfig-Scenario-{{scenario_number}}" - state: present - extra_vars: - prometheus_namespace_project_name: prometheus - sre_agent_name__version_number: "{{ sre_agent_name__version_number }}" - run_uuid: "{{ run_uuid }}" - scenario_number: "{{ scenario_number }}" - run_number: "{{ run_number }}" - s3_bucket_name_for_results: "{{ s3_bucket_name_for_results }}" - sre_bench_runner: "{{ sre_bench_runner | bool }}" - s3_endpoint_url: "{{ s3_endpoint_url }}" - register: json_output - - - name: (Task) Initialize alert recorder every minute - awx.awx.schedule: - controller_host: "{{ controller_host }}" - name: "Schedule for Alert Recorder - {{ scenario_number }}" - state: present - unified_job_template: "Scenario-{{ scenario_number }}--Setup-Alert-Recorder" - rrule: "DTSTART:{{ ansible_date_time.year }}{{ ansible_date_time.month }}{{ ansible_date_time.day }}T{{ ansible_date_time.hour }}{{ ansible_date_time.minute }}{{ ansible_date_time.second }}Z RRULE:INTERVAL=1;FREQ=MINUTELY" diff --git a/sre/playbooks/manage_incidents.yaml b/sre/playbooks/manage_incidents.yaml index d9387a5a..9669fa93 100644 --- a/sre/playbooks/manage_incidents.yaml +++ b/sre/playbooks/manage_incidents.yaml @@ -48,31 +48,23 @@ # when: # - incident.runner != 'local' - # - name: Import e2e role - # ansible.builtin.import_role: - # name: e2e - # tasks_from: record_topology_information - # tags: - # - pre_fault_removal - # when: - # - incident.runner != 'local' - - - name: Import faults role + - name: Import recorders role ansible.builtin.import_role: - name: faults + name: recorders vars: - faults_cluster: + recorders_cluster: kubeconfig: "{{ cluster.kubeconfig }}" - faults_specs: "{{ incidents_spec.spec.faults }}" + platform: "{{ cluster_platform }}" + recorders_enabled: + prometheus: "{{ tools_enabled.prometheus }}" - # - name: Import e2e role + # - name: Import faults role # ansible.builtin.import_role: - # name: e2e - # tasks_from: record_topology_information - # tags: - # - post_fault_injection - # when: - # - incident.runner != 'local' + # name: faults + # vars: + # faults_cluster: + # kubeconfig: "{{ cluster.kubeconfig }}" + # faults_specs: "{{ incidents_spec.spec.faults }}" # - name: Import e2e role # ansible.builtin.import_role: diff --git a/sre/roles/awx/tasks/configure_awx.yaml b/sre/roles/awx/tasks/configure_awx.yaml index 704b7fcd..eda5c7d6 100644 --- a/sre/roles/awx/tasks/configure_awx.yaml +++ b/sre/roles/awx/tasks/configure_awx.yaml @@ -105,23 +105,6 @@ when: - awx_credentials.aws is defined -# TODO: Remove this credential once the cronjobs are refactored - -- name: Add/Remove AWX credentials - awx.awx.credential: - # (Hack): As the value set using set_fact controller_host is not picked up - controller_host: "{{ awx_controller_host }}" - 
controller_password: "{{ awx_controller_password }}" # pragma: allowlist secret - controller_username: admin - name: Credential-AWX - description: Credential-AWX - organization: IT-Bench - credential_type: Red Hat Ansible Automation Platform - inputs: - host: "{{ awx_controller_host }}" - username: admin - password: "{{ awx_controller_password }}" # pragma: allowlist secret - - name: Create project for ITBench Repository awx.awx.project: controller_host: "{{ awx_controller_host }}" diff --git a/sre/roles/awx/tasks/configure_jobs.yaml b/sre/roles/awx/tasks/configure_jobs.yaml index 13d3dcf8..22966e18 100644 --- a/sre/roles/awx/tasks/configure_jobs.yaml +++ b/sre/roles/awx/tasks/configure_jobs.yaml @@ -267,60 +267,6 @@ project: GitHub-ITBench state: present -- name: Creating/removing job template to check for alerts - awx.awx.job_template: - controller_host: "{{ awx_controller_host }}" - controller_password: "{{ awx_controller_password }}" # pragma: allowlist secret - controller_username: admin - credentials: - - Cluster-{{ incident_index + 1 }}-Kubeconfig - - AWS - execution_environment: AWX-EE-Custom - job_type: run - name: "Scenario-{{ incident }}--Check-for-Alerts" - organization: ITBench-Scenarios - playbook: sre/cron_jobs/alert_recorder.yaml - project: GitHub-ITBench - state: present - -- name: Creating/removing job template to turn alert recorder on - awx.awx.job_template: - controller_host: "{{ awx_controller_host }}" - controller_password: "{{ awx_controller_password }}" # pragma: allowlist secret - controller_username: admin - credentials: - - Cluster-{{ incident_index + 1 }}-Kubeconfig - - Credential-AWX - execution_environment: AWX-EE-Custom - extra_vars: - controller_host: "{{ awx_controller_host }}" - scenario_number: "{{ incident }}" - job_type: run - name: "Scenario-{{ incident }}--Alert-Recorder-On" - organization: ITBench-Scenarios - playbook: sre/cron_jobs/alert_recorder_init.yaml - project: GitHub-ITBench - state: present - -- name: Creating/removing job template to turn alert recorder off - awx.awx.job_template: - controller_host: "{{ awx_controller_host }}" - controller_password: "{{ awx_controller_password }}" # pragma: allowlist secret - controller_username: admin - credentials: - - Cluster-{{ incident_index + 1 }}-Kubeconfig - - Credential-AWX - execution_environment: AWX-EE-Custom - extra_vars: - controller_host: "{{ awx_controller_host }}" - scenario_number: "{{ incident }}" - job_type: run - name: "Scenario-{{ incident }}--Alert-Recorder-Off" - organization: ITBench-Scenarios - playbook: sre/cron_jobs/alert_recorder_deinit.yaml - project: GitHub-ITBench - state: present - - name: Handover to Agent awx.awx.job_template: controller_host: "{{ awx_controller_host }}" diff --git a/sre/roles/awx/tasks/configure_workflows.yaml b/sre/roles/awx/tasks/configure_workflows.yaml index cb52f2c5..ff5d4811 100644 --- a/sre/roles/awx/tasks/configure_workflows.yaml +++ b/sre/roles/awx/tasks/configure_workflows.yaml @@ -69,26 +69,6 @@ unified_job_template: name: "Scenario-{{ incident }}--Post-Fault-Injection" type: job_template - related: - success_nodes: - - identifier: node-check-for-alerts - failure_nodes: - - identifier: node-capture-failed-deployment - always_nodes: [] - - identifier: node-check-for-alerts - unified_job_template: - name: "Scenario-{{ incident }}--Check-for-Alerts" - type: job_template - related: - success_nodes: - - identifier: node-alert-recorder-on - failure_nodes: - - identifier: node-capture-failed-deployment - always_nodes: [] - - identifier: 
node-alert-recorder-on - unified_job_template: - name: "Scenario-{{ incident }}--Alert-Recorder-On" - type: job_template related: success_nodes: - identifier: node-handover-to-agent @@ -135,15 +115,6 @@ unified_job_template: name: "Scenario-{{ incident }}--Uninstalling-K8s-Resources-Default-Namespace" type: job_template - related: - success_nodes: [] - failure_nodes: [] - always_nodes: - - identifier: node-alert-recorder-off - - identifier: node-alert-recorder-off - unified_job_template: - name: "Scenario-{{ incident }}--Alert-Recorder-Off" - type: job_template related: success_nodes: [] failure_nodes: [] diff --git a/sre/roles/recorders/defaults/main/namespace.yaml b/sre/roles/recorders/defaults/main/namespace.yaml new file mode 100644 index 00000000..138c47b4 --- /dev/null +++ b/sre/roles/recorders/defaults/main/namespace.yaml @@ -0,0 +1,3 @@ +--- +recorders_namespace: + name: data-recorders diff --git a/sre/roles/recorders/files/kubernetes/alerts/prometheus.yaml b/sre/roles/recorders/files/kubernetes/alerts/prometheus.yaml new file mode 100644 index 00000000..b6f2ad8e --- /dev/null +++ b/sre/roles/recorders/files/kubernetes/alerts/prometheus.yaml @@ -0,0 +1,71 @@ +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + labels: + app.kubernetes.io/name: prometheus-alert-recorder + app.kubernetes.io/part-of: it-bench + name: prometheus-alert-recorder +spec: + selector: + matchLabels: + app.kubernetes.io/name: prometheus-alert-recorder + app.kubernetes.io/part-of: it-bench + template: + metadata: + annotations: + openshift.io/required-scc: restricted-v2 + labels: + app.kubernetes.io/name: prometheus-alert-recorder + app.kubernetes.io/part-of: it-bench + spec: + containers: + - name: recorder + image: registry.access.redhat.com/ubi9/python-312:9.6-1754326132 + command: + - /bin/sh + args: + - -c + - "python3.12 -m pip install -r ~/deps/requirements.txt && python3.12 ~/scripts/gather.py" + resources: + requests: + cpu: 100m + memory: 125Mi + limits: + memory: 250Mi + volumeMounts: + - name: dependencies + mountPath: /opt/app-root/src/deps + readOnly: true + - name: scripts + mountPath: /opt/app-root/src/scripts + readOnly: true + - name: prometheus-alert-records + mountPath: /opt/app-root/src/records + securityContext: + fsGroup: 1001 + volumes: + - name: scripts + configMap: + name: alerts-recorder-prometheus-scripts + items: + - key: script + path: gather.py + - name: dependencies + configMap: + name: alerts-recorder-prometheus-scripts + items: + - key: deps + path: requirements.txt + replicas: 1 + volumeClaimTemplates: + - metadata: + name: prometheus-alert-records + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 1Gi + persistentVolumeClaimRetentionPolicy: + whenDeleted: Delete diff --git a/sre/roles/recorders/files/scripts/alerts/prometheus/gather.py b/sre/roles/recorders/files/scripts/alerts/prometheus/gather.py new file mode 100644 index 00000000..c2bfb577 --- /dev/null +++ b/sre/roles/recorders/files/scripts/alerts/prometheus/gather.py @@ -0,0 +1,64 @@ +import datetime +import json +import logging +import os +import sys +import time + +from datetime import datetime, timedelta, timezone + +import requests + +from requests.adapters import HTTPAdapter +from urllib3.util import Retry + +# Logging +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) + +logger = logging.getLogger(__name__) + + +def main(): + endpoint = os.environ.get("PROMETHEUS_ENDPOINT") + if endpoint is None: + sys.exit("error: PROMETHEUS_ENDPOINT environment variable is not set") + + 
headers = { "Content-Type": "application/json" }
+
+    token = os.environ.get("PROMETHEUS_TOKEN")
+    if token is not None:
+        headers["Authorization"] = "Bearer {0}".format(token)
+
+    retries = Retry(total=3, backoff_factor=0.1)
+    adapter = HTTPAdapter(max_retries=retries)
+
+    session = requests.Session()
+    session.mount("http://", adapter)
+    session.mount("https://", adapter)
+
+    while True:
+        next_datetime = datetime.now() + timedelta(seconds=60)
+
+        response = session.get("{0}/api/v1/alerts".format(endpoint), headers=headers, verify=True)
+
+        if response.status_code != 200:
+            logger.warning("unable to query prometheus server")
+        else:
+            content = response.json()
+            alerts = content.get("data", {}).get("alerts", [])
+
+            logger.info("retrieved {0} alerts from prometheus server".format(len(alerts)))
+
+            utc_seconds = (datetime.now(timezone.utc) - datetime(1970, 1, 1, tzinfo=timezone.utc)).total_seconds()
+            file_path = os.path.join(os.path.expanduser("~"), "records", "{0}-alerts.json".format(round(utc_seconds)))
+
+            with open(file_path, "w") as f:
+                json.dump(alerts, f, indent=4)
+
+        sleep_interval = (next_datetime - datetime.now()).total_seconds()
+        if sleep_interval > 0:
+            logger.debug("sleep for {0} seconds".format(sleep_interval))
+            time.sleep(sleep_interval)
+
+if __name__ == "__main__":
+    main()
diff --git a/sre/roles/recorders/files/scripts/alerts/prometheus/requirements.txt b/sre/roles/recorders/files/scripts/alerts/prometheus/requirements.txt
new file mode 100644
index 00000000..2c24336e
--- /dev/null
+++ b/sre/roles/recorders/files/scripts/alerts/prometheus/requirements.txt
@@ -0,0 +1 @@
+requests==2.31.0
diff --git a/sre/roles/recorders/meta/argument_specs.yaml b/sre/roles/recorders/meta/argument_specs.yaml
new file mode 100644
index 00000000..0f42c3d3
--- /dev/null
+++ b/sre/roles/recorders/meta/argument_specs.yaml
@@ -0,0 +1,53 @@
+---
+argument_specs:
+  main:
+    short_description: Main entry point for recorders role
+    description:
+      - This is the main entry point for the recorders role.
+      - This role is responsible for installing and uninstalling data recorders.
+  author:
+    - Gerard Vanloo
+  options:
+    recorders_cluster:
+      required: false
+      type: dict
+      options:
+        kubeconfig:
+          default: ~/.kube/config
+          required: false
+          type: str
+        platform:
+          choices:
+            - kubernetes
+            - openshift
+          default: kubernetes
+          required: false
+          type: str
+    recorders_enabled:
+      required: false
+      type: dict
+      options:
+        alerts:
+          required: false
+          type: dict
+          options:
+            prometheus:
+              default: true
+              required: false
+              type: bool
+        topology:
+          required: false
+          type: dict
+          options:
+            kubernetes:
+              default: true
+              required: false
+              type: bool
+        traces:
+          required: false
+          type: dict
+          options:
+            jaeger:
+              default: true
+              required: false
+              type: bool
diff --git a/sre/roles/recorders/tasks/install.yaml b/sre/roles/recorders/tasks/install.yaml
new file mode 100644
index 00000000..6c80eab6
--- /dev/null
+++ b/sre/roles/recorders/tasks/install.yaml
@@ -0,0 +1,16 @@
+---
+- name: Create the namespace
+  kubernetes.core.k8s:
+    kubeconfig: "{{ recorders_cluster.kubeconfig }}"
+    resource_definition:
+      apiVersion: v1
+      kind: Namespace
+      metadata:
+        name: "{{ recorders_namespace.name }}"
+    state: present
+
+- name: Import alert recorder installation tasks
+  ansible.builtin.import_tasks:
+    file: install_alerts_recorders.yaml
+  when:
+    - recorders_enabled.alerts is defined
diff --git a/sre/roles/recorders/tasks/install_alerts_recorders.yaml b/sre/roles/recorders/tasks/install_alerts_recorders.yaml
new file mode 100644
index 00000000..e409249e
--- /dev/null
+++ b/sre/roles/recorders/tasks/install_alerts_recorders.yaml
@@ -0,0 +1,6 @@
+---
+- name: Import Prometheus installation tasks
+  ansible.builtin.import_tasks:
+    file: install_alerts_recorders_prometheus.yaml
+  when:
+    - recorders_enabled.alerts.prometheus | default(true)
diff --git a/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml b/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml
new file mode 100644
index 00000000..8612ad79
--- /dev/null
+++ b/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml
@@ -0,0 +1,88 @@
+---
+- name: Import tools role for variable setting tasks
+  ansible.builtin.import_role:
+    name: tools
+    tasks_from: set_prometheus_endpoint
+  vars:
+    tools_cluster:
+      kubeconfig: "{{ recorders_cluster.kubeconfig }}"
+      platform: "{{ recorders_cluster.platform }}"
+
+- name: Create Secret with Prometheus bearer token
+  kubernetes.core.k8s:
+    kubeconfig: "{{ recorders_cluster.kubeconfig }}"
+    resource_definition:
+      apiVersion: v1
+      kind: Secret
+      metadata:
+        name: alerts-recorder-prometheus-token
+        namespace: "{{ recorders_namespace.name }}"
+      stringData:
+        token: "{{ tools_prometheus_bearer_token }}"
+    state: present
+  when:
+    - recorders_cluster.platform == "openshift"
+    - tools_prometheus_bearer_token is defined
+
+- name: Create ConfigMap with Python script
+  kubernetes.core.k8s:
+    kubeconfig: "{{ recorders_cluster.kubeconfig }}"
+    resource_definition:
+      apiVersion: v1
+      kind: ConfigMap
+      metadata:
+        name: alerts-recorder-prometheus-scripts
+        namespace: "{{ recorders_namespace.name }}"
+      data:
+        deps: "{{ lookup('ansible.builtin.file', 'files/scripts/alerts/prometheus/requirements.txt') }}"
+        script: "{{ lookup('ansible.builtin.file', 'files/scripts/alerts/prometheus/gather.py') }}"
+    state: present
+
+- name: Install Prometheus Alert Recorder
+  kubernetes.core.k8s:
+    kubeconfig: "{{ recorders_cluster.kubeconfig }}"
+    namespace: "{{ recorders_namespace.name }}"
+    src: files/kubernetes/alerts/prometheus.yaml
+    state: present
+
+- name: Create Prometheus Alert Recorder environment list
+  ansible.builtin.set_fact:
+    recorders_prometheus_env_vars:
+      - name: PROMETHEUS_ENDPOINT
+        value: "{{ tools_prometheus_endpoint }}"
+
+- name: Add Secret to environment list
+  ansible.builtin.set_fact:
+    recorders_prometheus_env_vars: |
+      {{
+        recorders_prometheus_env_vars
+        +
+        [{
+          'name': 'PROMETHEUS_TOKEN',
+          'valueFrom': {
+            'secretKeyRef': {
+              'name': 'alerts-recorder-prometheus-token',
+              'key': 'token'
+            }
+          }
+        }]
+      }}
+  when:
+    - recorders_cluster.platform == "openshift"
+    - tools_prometheus_bearer_token is defined
+
+- name: Update Prometheus Alert Recorder environment variables
+  kubernetes.core.k8s:
+    kubeconfig: "{{ recorders_cluster.kubeconfig }}"
+    resource_definition:
+      apiVersion: apps/v1
+      kind: StatefulSet
+      metadata:
+        name: prometheus-alert-recorder
+        namespace: "{{ recorders_namespace.name }}"
+      spec:
+        template:
+          spec:
+            containers:
+              - name: recorder
+                env: "{{ recorders_prometheus_env_vars }}"
+    state: patched
diff --git a/sre/roles/recorders/tasks/main.yaml b/sre/roles/recorders/tasks/main.yaml
new file mode 100644
index 00000000..7ef96d92
--- /dev/null
+++ b/sre/roles/recorders/tasks/main.yaml
@@ -0,0 +1,12 @@
+---
+- name: Import installation tasks
+  ansible.builtin.import_tasks:
+    file: install.yaml
+  tags:
+    - install_recorders
+
+- name: Import uninstallation tasks
+  ansible.builtin.import_tasks:
+    file: uninstall.yaml
+  tags:
+    - uninstall_recorders
diff --git a/sre/roles/recorders/tasks/uninstall.yaml b/sre/roles/recorders/tasks/uninstall.yaml
new file mode 100644
index 00000000..62cb12f0
--- /dev/null
+++ b/sre/roles/recorders/tasks/uninstall.yaml
@@ -0,0 +1,17 @@
+---
+- name: Import alert recorder uninstallation tasks
+  ansible.builtin.import_tasks:
+    file: uninstall_alerts_recorders.yaml
+  when:
+    - recorders_enabled.alerts is defined
+
+- name: Delete the namespace
+  kubernetes.core.k8s:
+    kubeconfig: "{{ recorders_cluster.kubeconfig }}"
+    resource_definition:
+      apiVersion: v1
+      kind: Namespace
+      metadata:
+        name: "{{ recorders_namespace.name }}"
+    state: absent
+    wait: true
diff --git a/sre/roles/recorders/tasks/uninstall_alerts_recorders.yaml b/sre/roles/recorders/tasks/uninstall_alerts_recorders.yaml
new file mode 100644
index 00000000..438d0610
--- /dev/null
+++ b/sre/roles/recorders/tasks/uninstall_alerts_recorders.yaml
@@ -0,0 +1,6 @@
+---
+- name: Import Prometheus uninstallation tasks
+  ansible.builtin.import_tasks:
+    file: uninstall_alerts_recorders_prometheus.yaml
+  when:
+    - recorders_enabled.alerts.prometheus | default(true)
diff --git a/sre/roles/recorders/tasks/uninstall_alerts_recorders_prometheus.yaml b/sre/roles/recorders/tasks/uninstall_alerts_recorders_prometheus.yaml
new file mode 100644
index 00000000..917031e2
--- /dev/null
+++ b/sre/roles/recorders/tasks/uninstall_alerts_recorders_prometheus.yaml
@@ -0,0 +1,70 @@
+---
+- name: Retrieve the alert recorder pod name
+  kubernetes.core.k8s_info:
+    api_version: v1
+    kind: Pod
+    kubeconfig: "{{ recorders_cluster.kubeconfig }}"
+    namespace: "{{ recorders_namespace.name }}"
+    label_selectors:
+      - app.kubernetes.io/name = prometheus-alert-recorder
+      - app.kubernetes.io/part-of = it-bench
+  register: recorders_pods_info
+
+- name: Copy records directory from pod
+  kubernetes.core.k8s_cp:
+    kubeconfig: "{{ recorders_cluster.kubeconfig }}"
+    local_path: /tmp/alerts
+    namespace: "{{ recorders_pods_info.resources[0].metadata.namespace }}"
+    pod: "{{ recorders_pods_info.resources[0].metadata.name }}"
+    remote_path: /opt/app-root/src/records
+    state: from_pod
+  when:
+    - recorders_pods_info is defined
+    - recorders_pods_info.resources | length == 1
+
+- name: Uninstall Prometheus Alert Recorder
+  kubernetes.core.k8s:
+    kubeconfig: "{{ recorders_cluster.kubeconfig }}"
+    namespace: "{{ recorders_namespace.name }}"
+    src: files/kubernetes/alerts/prometheus.yaml
+    state: absent
+    wait: true
+
+- name: Delete ConfigMap with Python script
+  kubernetes.core.k8s:
+    kubeconfig: "{{ recorders_cluster.kubeconfig }}"
+    resource_definition:
+      apiVersion: v1
+      kind: ConfigMap
+      metadata:
+        name: alerts-recorder-prometheus-scripts
+        namespace: "{{ recorders_namespace.name }}"
+    state: absent
+    wait: true
+
+- name: Check for Secret with Prometheus bearer token
+  kubernetes.core.k8s_info:
+    api_version: v1
+    kind: Secret
+    kubeconfig: "{{ recorders_cluster.kubeconfig }}"
+    name: alerts-recorder-prometheus-token
+    namespace: "{{ recorders_namespace.name }}"
+  register: recorders_prometheus_secret_info
+  when:
+    - recorders_cluster.platform == "openshift"
+
+- name: Delete Secret with Prometheus bearer token
+  kubernetes.core.k8s:
+    kubeconfig: "{{ recorders_cluster.kubeconfig }}"
+    resource_definition:
+      apiVersion: v1
+      kind: Secret
+      metadata:
+        name: "{{ recorders_prometheus_secret_info.resources[0].metadata.name }}"
+        namespace: "{{ recorders_prometheus_secret_info.resources[0].metadata.namespace }}"
+    state: absent
+    wait: true
+  when:
+    - recorders_cluster.platform == "openshift"
+    - recorders_prometheus_secret_info is defined
+    - recorders_prometheus_secret_info.resources | length == 1
diff --git a/sre/roles/tools/meta/argument_specs.yaml b/sre/roles/tools/meta/argument_specs.yaml
index ed08f8e5..8ccf4b72 100644
--- a/sre/roles/tools/meta/argument_specs.yaml
+++ b/sre/roles/tools/meta/argument_specs.yaml
@@ -80,3 +80,26 @@ argument_specs:
         kubeconfig:
           required: true
           type: str
+  set_prometheus_endpoint:
+    short_description: Entry point for setting Prometheus connection facts
+    description:
+      - This entry point discovers the Prometheus endpoint for the cluster.
+      - On OpenShift, it also retrieves a bearer token for authenticating to Prometheus.
+ author: + - Gerard Vanloo + options: + tools_cluster: + required: false + type: dict + options: + kubeconfig: + default: ~/.kube/config + required: false + type: str + platform: + choices: + - kubernetes + - openshift + default: kubernetes + required: false + type: str diff --git a/sre/roles/tools/tasks/install_opencost.yaml b/sre/roles/tools/tasks/install_opencost.yaml index d9513d05..b2c87006 100644 --- a/sre/roles/tools/tasks/install_opencost.yaml +++ b/sre/roles/tools/tasks/install_opencost.yaml @@ -15,32 +15,9 @@ it-bench/monitoring: "true" state: present -- name: Create bearer token - ansible.builtin.command: - cmd: oc whoami -t - environment: - KUBECONFIG: "{{ tools_cluster.kubeconfig | ansible.builtin.expanduser }}" - register: tools_prometheus_token - changed_when: false - when: - - tools_cluster.platform == "openshift" - -- name: Retrieve Prometheus route - kubernetes.core.k8s_info: - api_version: route.openshift.io/v1 - kind: Route - kubeconfig: "{{ tools_cluster.kubeconfig }}" - name: prometheus-k8s - namespace: openshift-monitoring - register: tools_prometheus_k8s_route_info - when: - - tools_cluster.platform == "openshift" - -- name: Parse Prometheus hostname - ansible.builtin.set_fact: - tools_prometheus_k8s_url: https://{{ tools_prometheus_k8s_route_info.resources[0].spec.host }} - when: - - tools_cluster.platform == "openshift" +- name: Import variable setting task + ansible.builtin.import_tasks: + file: set_prometheus_endpoint.yaml - name: Install OpenCost kubernetes.core.helm: @@ -80,10 +57,10 @@ openshift: enabled: "{{ tools_cluster.platform == 'openshift' }}" prometheus: - bearer_token: "{{ tools_prometheus_token.stdout if tools_cluster.platform == 'openshift' else omit }}" + bearer_token: "{{ tools_prometheus_bearer_token | default(omit) }}" external: enabled: "{{ tools_cluster.platform == 'openshift' }}" - url: "{{ tools_prometheus_k8s_url | default(omit) }}" + url: "{{ tools_prometheus_endpoint | default(omit) }}" internal: enabled: "{{ tools_cluster.platform == 'kubernetes' }}" serviceName: "{{ tools_helm_releases.prometheus.name }}-kube-prometheus-prometheus" diff --git a/sre/roles/tools/tasks/set_prometheus_endpoint.yaml b/sre/roles/tools/tasks/set_prometheus_endpoint.yaml new file mode 100644 index 00000000..6badc882 --- /dev/null +++ b/sre/roles/tools/tasks/set_prometheus_endpoint.yaml @@ -0,0 +1,53 @@ +--- +- name: Retrieve bearer token from OpenShift CLI + ansible.builtin.command: + cmd: oc whoami -t + environment: + KUBECONFIG: "{{ tools_cluster.kubeconfig | ansible.builtin.expanduser }}" + register: tools_oc_user_token + changed_when: false + when: + - tools_cluster.platform == "openshift" + +- name: Retrieve OpenShift Prometheus Route + kubernetes.core.k8s_info: + api_version: route.openshift.io/v1 + kind: Route + kubeconfig: "{{ tools_cluster.kubeconfig }}" + name: prometheus-k8s + namespace: openshift-monitoring + register: tools_prometheus_k8s_route_info + when: + - tools_cluster.platform == "openshift" + +- name: Extract Prometheus hostname and bearer token + ansible.builtin.set_fact: + tools_prometheus_bearer_token: "{{ tools_oc_user_token.stdout }}" + tools_prometheus_endpoint: https://{{ tools_prometheus_k8s_route_info.resources[0].spec.host }} + when: + - tools_cluster.platform == "openshift" + - tools_oc_user_token is defined + - tools_oc_user_token.rc == 0 + - tools_prometheus_k8s_route_info is defined + - tools_prometheus_k8s_route_info.resources | length == 1 + +- name: Retrieve Prometheus service info + kubernetes.core.k8s_info: + 
api_version: v1 + kind: Service + kubeconfig: "{{ tools_cluster.kubeconfig }}" + namespace: "{{ tools_helm_releases.prometheus.namespace }}" + label_selectors: + - app = kube-prometheus-stack-prometheus + register: tools_prometheus_service_info + when: + - tools_cluster.platform == "kubernetes" + +- name: Extract Prometheus hostname + ansible.builtin.set_fact: + tools_prometheus_endpoint: http://{{ tools_prometheus_service_info.resources[0].metadata.name }}.{{ + tools_prometheus_service_info.resources[0].metadata.namespace }}.svc.cluster.local:9090 + when: + - tools_cluster.platform == "kubernetes" + - tools_prometheus_service_info is defined + - tools_prometheus_service_info.resources | length == 1 From 9b10f6b33e13a75ff04a9f4455decb893f99cbde Mon Sep 17 00:00:00 2001 From: Gerard Vanloo Date: Wed, 13 Aug 2025 15:17:32 -0400 Subject: [PATCH 03/35] feat: replace post and pre fault injection/removal tasks with recorders role Signed-off-by: Gerard Vanloo --- sre/playbooks/manage_incidents.yaml | 5 +- sre/roles/awx/tasks/configure_jobs.yaml | 34 --------- sre/roles/awx/tasks/configure_workflows.yaml | 10 --- sre/roles/e2e/tasks/leverage_ingress.yaml | 18 ----- .../e2e/tasks/leverage_port_forwarding.yaml | 34 --------- .../tasks/record_topology_information.yaml | 47 ------------ .../files/kubernetes/topology/kubernetes.yaml | 71 +++++++++++++++++++ .../scripts/topology/kubernetes/gather.py | 60 ++++++++++++++++ .../topology/kubernetes/requirements.txt | 1 + sre/roles/recorders/tasks/install.yaml | 6 ++ .../tasks/install_topology_recorders.yaml | 6 ++ ...install_topology_recorders_kubernetes.yaml | 49 +++++++++++++ sre/roles/recorders/tasks/uninstall.yaml | 6 ++ .../tasks/uninstall_topology_recorders.yaml | 6 ++ ...install_topology_recorders_kubernetes.yaml | 43 +++++++++++ ..._kubernetes_topology_monitor_endpoint.yaml | 19 +++++ 16 files changed, 271 insertions(+), 144 deletions(-) delete mode 100644 sre/roles/e2e/tasks/leverage_ingress.yaml delete mode 100644 sre/roles/e2e/tasks/leverage_port_forwarding.yaml delete mode 100644 sre/roles/e2e/tasks/record_topology_information.yaml create mode 100644 sre/roles/recorders/files/kubernetes/topology/kubernetes.yaml create mode 100644 sre/roles/recorders/files/scripts/topology/kubernetes/gather.py create mode 100644 sre/roles/recorders/files/scripts/topology/kubernetes/requirements.txt create mode 100644 sre/roles/recorders/tasks/install_topology_recorders.yaml create mode 100644 sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml create mode 100644 sre/roles/recorders/tasks/uninstall_topology_recorders.yaml create mode 100644 sre/roles/recorders/tasks/uninstall_topology_recorders_kubernetes.yaml create mode 100644 sre/roles/tools/tasks/set_kubernetes_topology_monitor_endpoint.yaml diff --git a/sre/playbooks/manage_incidents.yaml b/sre/playbooks/manage_incidents.yaml index 9669fa93..950fde5b 100644 --- a/sre/playbooks/manage_incidents.yaml +++ b/sre/playbooks/manage_incidents.yaml @@ -56,7 +56,10 @@ kubeconfig: "{{ cluster.kubeconfig }}" platform: "{{ cluster_platform }}" recorders_enabled: - prometheus: "{{ tools_enabled.prometheus }}" + alerts: + prometheus: "{{ tools_enabled.prometheus }}" + topology: + kubernetes: "{{ tools_enabled.kubernetes_topology_monitor }}" # - name: Import faults role # ansible.builtin.import_role: diff --git a/sre/roles/awx/tasks/configure_jobs.yaml b/sre/roles/awx/tasks/configure_jobs.yaml index 22966e18..4eb7eb5c 100644 --- a/sre/roles/awx/tasks/configure_jobs.yaml +++ 
b/sre/roles/awx/tasks/configure_jobs.yaml @@ -169,40 +169,6 @@ project: GitHub-ITBench state: present -- name: Creating/removing job template for post fault injection task(s) - awx.awx.job_template: - controller_host: "{{ awx_controller_host }}" - controller_password: "{{ awx_controller_password }}" # pragma: allowlist secret - controller_username: admin - credentials: - - Cluster-{{ incident_index + 1 }}-Kubeconfig - - AWS - execution_environment: AWX-EE-Custom - job_tags: post_fault_injection - job_type: run - name: "Scenario-{{ incident }}--Post-Fault-Injection" - organization: ITBench-Scenarios - playbook: sre/base.yaml - project: GitHub-ITBench - state: present - -- name: Creating/removing job template for pre fault removal task(s) - awx.awx.job_template: - controller_host: "{{ awx_controller_host }}" - controller_password: "{{ awx_controller_password }}" # pragma: allowlist secret - controller_username: admin - credentials: - - Cluster-{{ incident_index + 1 }}-Kubeconfig - - AWS - execution_environment: AWX-EE-Custom - job_tags: pre_fault_removal - job_type: run - name: "Scenario-{{ incident }}--Post-Fault-Injection" - organization: ITBench-Scenarios - playbook: sre/base.yaml - project: GitHub-ITBench - state: present - - name: Capture failed deployment state awx.awx.job_template: controller_host: "{{ awx_controller_host }}" diff --git a/sre/roles/awx/tasks/configure_workflows.yaml b/sre/roles/awx/tasks/configure_workflows.yaml index ff5d4811..d5789267 100644 --- a/sre/roles/awx/tasks/configure_workflows.yaml +++ b/sre/roles/awx/tasks/configure_workflows.yaml @@ -59,16 +59,6 @@ unified_job_template: name: "Incident {{ incident }}: Inject Faults" type: job_template - related: - success_nodes: - - identifier: node-post-fault-injection - failure_nodes: - - identifier: node-capture-failed-deployment - always_nodes: [] - - identifier: node-post-fault-injection - unified_job_template: - name: "Scenario-{{ incident }}--Post-Fault-Injection" - type: job_template related: success_nodes: - identifier: node-handover-to-agent diff --git a/sre/roles/e2e/tasks/leverage_ingress.yaml b/sre/roles/e2e/tasks/leverage_ingress.yaml deleted file mode 100644 index ff658130..00000000 --- a/sre/roles/e2e/tasks/leverage_ingress.yaml +++ /dev/null @@ -1,18 +0,0 @@ ---- -- name: Get the Ingress URL for the Observability Stack - ansible.builtin.shell: "KUBECONFIG={{ kubeconfig }} kubectl -n kube-system get ingress topology-monitor -o json" - register: observability_stack_ingress - retries: 5 - delay: 60 - until: (observability_stack_ingress.stdout | length) > 0 - ignore_errors: yes - -- name: Extract the Ingress hostname information - set_fact: - ingress_hostname: "{{ observability_stack_ingress.stdout | from_json | json_query('status.loadBalancer.ingress[0].hostname') }}" - when: observability_stack_ingress.stdout | trim != '' - -- name: Set the Topology URL - set_fact: - topology_url: "http://{{ ingress_hostname }}/topology" - when: ingress_hostname is defined and ingress_hostname | trim != '' diff --git a/sre/roles/e2e/tasks/leverage_port_forwarding.yaml b/sre/roles/e2e/tasks/leverage_port_forwarding.yaml deleted file mode 100644 index fb464842..00000000 --- a/sre/roles/e2e/tasks/leverage_port_forwarding.yaml +++ /dev/null @@ -1,34 +0,0 @@ ---- -- name: Check availability of ports - ansible.builtin.shell: | - lsof -i :{{ item }} > /dev/null && echo "in_use" || echo "available" - register: lsof_check - loop: "{{ range(32100, 32125) | list }}" - changed_when: false - failed_when: false - loop_control: - 
loop_var: item - when: ingress_hostname is undefined or ingress_hostname | trim == '' - -- name: Assign a dynamic port if one is available - set_fact: - dynamic_port: "{{ (lsof_check.results | selectattr('stdout', 'equalto', 'available') | map(attribute='item') | list | first) }}" - when: ingress_hostname is undefined or ingress_hostname | trim == '' - -- name: Kubectl port-forward on/for the Topology Monitor - ansible.builtin.shell: KUBECONFIG={{ kubeconfig }} kubectl -n kube-system port-forward "svc/topology-monitor" "{{ dynamic_port }}:8080" --request-timeout=15m - async: 900 - poll: 0 - register: topology_port_forward_for_topology_collection_post_fault_injection - when: ingress_hostname is undefined or ingress_hostname | trim == '' - -- name: Wait for port-forward to be available - ansible.builtin.wait_for_connection: - delay: 5 - timeout: 30 - when: ingress_hostname is undefined or ingress_hostname | trim == '' - -- name: Set the Topology URL - set_fact: - topology_url: "http://127.0.0.1:{{ dynamic_port }}" - when: ingress_hostname is undefined or ingress_hostname | trim == '' diff --git a/sre/roles/e2e/tasks/record_topology_information.yaml b/sre/roles/e2e/tasks/record_topology_information.yaml deleted file mode 100644 index 8ab9feb9..00000000 --- a/sre/roles/e2e/tasks/record_topology_information.yaml +++ /dev/null @@ -1,47 +0,0 @@ ---- -- name: Apply tags to tasks within included file - ansible.builtin.import_tasks: - file: leverage_ingress.yaml - -- name: Apply tags to tasks within included file - ansible.builtin.import_tasks: - file: leverage_port_forwarding.yaml - -- name: Call the Topology APIs - uri: - url: "{{ topology_url }}/{{ item }}" - method: GET - return_content: yes - body_format: json - headers: - Content-Type: "application/json" - with_items: - - "nodes" - - "edges" - - "graph" - - "events" - register: topology_responses - retries: 10 - delay: 5 - until: topology_responses is succeeded - -- name: Ensure topology_information directory exists - ansible.builtin.file: - path: "/runner/topology_information" - state: directory - when: run_uuid is defined and scenario_number is defined and run_number is defined - -- name: Copy content to file(s) - ansible.builtin.copy: - content: "{{ item.content | to_json }}" - dest: "/runner/topology_information/start_{{ item.item }}_{{now(utc=true,fmt='%Y-%m-%dT%H:%M:%S.%f')}}.json" - with_items: "{{ topology_responses.results }}" - when: run_uuid is defined and scenario_number is defined and run_number is defined - -- name: Upload structured-unstructured outputs to S3 - community.aws.s3_sync: - bucket: "{{ s3_bucket_name_for_results }}" - file_root: "/runner/topology_information" - key_prefix: "{{ sre_agent_name__version_number }}/{{run_uuid}}/{{scenario_number}}/{{run_number}}/topology_information" - region: "us-east-2" - when: run_uuid is defined and scenario_number is defined and run_number is defined diff --git a/sre/roles/recorders/files/kubernetes/topology/kubernetes.yaml b/sre/roles/recorders/files/kubernetes/topology/kubernetes.yaml new file mode 100644 index 00000000..65f7cb63 --- /dev/null +++ b/sre/roles/recorders/files/kubernetes/topology/kubernetes.yaml @@ -0,0 +1,71 @@ +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + labels: + app.kubernetes.io/name: kubernetes-topology-recorder + app.kubernetes.io/part-of: it-bench + name: kubernetes-topology-recorder +spec: + selector: + matchLabels: + app.kubernetes.io/name: kubernetes-topology-recorder + app.kubernetes.io/part-of: it-bench + template: + metadata: + annotations: 
+ openshift.io/required-scc: restricted-v2 + labels: + app.kubernetes.io/name: kubernetes-topology-recorder + app.kubernetes.io/part-of: it-bench + spec: + containers: + - name: recorder + image: registry.access.redhat.com/ubi9/python-312:9.6-1754326132 + command: + - /bin/sh + args: + - -c + - "python3.12 -m pip install -r ~/deps/requirements.txt && python3.12 ~/scripts/gather.py" + resources: + requests: + cpu: 100m + memory: 125Mi + limits: + memory: 250Mi + volumeMounts: + - name: dependencies + mountPath: /opt/app-root/src/deps + readOnly: true + - name: scripts + mountPath: /opt/app-root/src/scripts + readOnly: true + - name: kubernetes-topology-records + mountPath: /opt/app-root/src/records + securityContext: + fsGroup: 1001 + volumes: + - name: scripts + configMap: + name: topology-recorder-kubernetes-scripts + items: + - key: script + path: gather.py + - name: dependencies + configMap: + name: topology-recorder-kubernetes-scripts + items: + - key: deps + path: requirements.txt + replicas: 1 + volumeClaimTemplates: + - metadata: + name: kubernetes-topology-records + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 1Gi + persistentVolumeClaimRetentionPolicy: + whenDeleted: Delete diff --git a/sre/roles/recorders/files/scripts/topology/kubernetes/gather.py b/sre/roles/recorders/files/scripts/topology/kubernetes/gather.py new file mode 100644 index 00000000..7d70a347 --- /dev/null +++ b/sre/roles/recorders/files/scripts/topology/kubernetes/gather.py @@ -0,0 +1,60 @@ +import datetime +import json +import logging +import os +import sys +import time + +from datetime import datetime, timedelta, timezone + +import requests + +from requests.adapters import HTTPAdapter +from urllib3.util import Retry + +# Logging +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) + +logger = logging.getLogger(__name__) + + +def main(): + endpoint = os.environ.get("KUBERNETES_TOPOLOGY_MONITOR_ENDPOINT") + if endpoint is None: + sys.exit("error: KUBERNETES_TOPOLOGY_MONITOR_ENDPOINT environment variable is not set") + + headers = { "Content-Type": "application/json" } + + retries = Retry(total=3, backoff_factor=0.1) + adapter = HTTPAdapter(max_retries=retries) + + session = requests.Session() + session.mount("http://", adapter) + session.mount("https://", adapter) + + while True: + next_datetime = datetime.now() + timedelta(seconds=60) + + for item in ["nodes", "edges", "graph", "events"]: + response = session.get("{0}/{1}".format(endpoint, item), headers=headers, verify=True) + + if response.status_code != 200: + logger.warning("unable to query kubernetes topology mapper for {0}".format(item)) + else: + content = response.json() + + logger.info("retrieved {0} data".format(item)) + + utc_seconds = (datetime.now(timezone.utc) - datetime(1970, 1, 1, tzinfo=timezone.utc)).total_seconds() + file_path = os.path.join(os.path.expanduser("~"), "records", "{0}-{1}.json".format(round(utc_seconds), item)) + + with open(file_path, "w") as f: + json.dump(content, f, indent=4) + + sleep_interval = (next_datetime - datetime.now()).total_seconds() + if sleep_interval > 0: + logger.debug("sleep for {0} seconds".format(sleep_interval)) + time.sleep(sleep_interval) + +if __name__ == "__main__": + main() diff --git a/sre/roles/recorders/files/scripts/topology/kubernetes/requirements.txt b/sre/roles/recorders/files/scripts/topology/kubernetes/requirements.txt new file mode 100644 index 00000000..2c24336e --- /dev/null +++ b/sre/roles/recorders/files/scripts/topology/kubernetes/requirements.txt @@ 
-0,0 +1 @@
+requests==2.31.0
diff --git a/sre/roles/recorders/tasks/install.yaml b/sre/roles/recorders/tasks/install.yaml
index 6c80eab6..52ffd8c6 100644
--- a/sre/roles/recorders/tasks/install.yaml
+++ b/sre/roles/recorders/tasks/install.yaml
@@ -14,3 +14,9 @@
     file: install_alerts_recorders.yaml
   when:
     - recorders_enabled.alerts is defined
+
+- name: Import topology recorder installation tasks
+  ansible.builtin.import_tasks:
+    file: install_topology_recorders.yaml
+  when:
+    - recorders_enabled.topology is defined
diff --git a/sre/roles/recorders/tasks/install_topology_recorders.yaml b/sre/roles/recorders/tasks/install_topology_recorders.yaml
new file mode 100644
index 00000000..be1a4c3f
--- /dev/null
+++ b/sre/roles/recorders/tasks/install_topology_recorders.yaml
@@ -0,0 +1,6 @@
+---
+- name: Import Kubernetes Topology Monitor installation tasks
+  ansible.builtin.import_tasks:
+    file: install_topology_recorders_kubernetes.yaml
+  when:
+    - recorders_enabled.topology.kubernetes | default(true)
diff --git a/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml b/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml
new file mode 100644
index 00000000..aa613b5a
--- /dev/null
+++ b/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml
@@ -0,0 +1,49 @@
+---
+- name: Import tools role for variable setting tasks
+  ansible.builtin.import_role:
+    name: tools
+    tasks_from: set_kubernetes_topology_monitor_endpoint
+  vars:
+    tools_cluster:
+      kubeconfig: "{{ recorders_cluster.kubeconfig }}"
+      platform: "{{ recorders_cluster.platform }}"
+
+- name: Create ConfigMap with Python script
+  kubernetes.core.k8s:
+    kubeconfig: "{{ recorders_cluster.kubeconfig }}"
+    resource_definition:
+      apiVersion: v1
+      kind: ConfigMap
+      metadata:
+        name: topology-recorder-kubernetes-scripts
+        namespace: "{{ recorders_namespace.name }}"
+      data:
+        deps: "{{ lookup('ansible.builtin.file', 'files/scripts/topology/kubernetes/requirements.txt') }}"
+        script: "{{ lookup('ansible.builtin.file', 'files/scripts/topology/kubernetes/gather.py') }}"
+    state: present
+
+- name: Install Kubernetes Topology Recorder
+  kubernetes.core.k8s:
+    kubeconfig: "{{ recorders_cluster.kubeconfig }}"
+    namespace: "{{ recorders_namespace.name }}"
+    src: files/kubernetes/topology/kubernetes.yaml
+    state: present
+
+- name: Update Kubernetes Topology Recorder environment variables
+  kubernetes.core.k8s:
+    kubeconfig: "{{ recorders_cluster.kubeconfig }}"
+    resource_definition:
+      apiVersion: apps/v1
+      kind: StatefulSet
+      metadata:
+        name: kubernetes-topology-recorder
+        namespace: "{{ recorders_namespace.name }}"
+      spec:
+        template:
+          spec:
+            containers:
+              - name: recorder
+                env:
+                  - name: KUBERNETES_TOPOLOGY_MONITOR_ENDPOINT
+                    value: "{{ tools_kubernetes_topology_mapper_endpoint }}"
+    state: patched
diff --git a/sre/roles/recorders/tasks/uninstall.yaml b/sre/roles/recorders/tasks/uninstall.yaml
index 62cb12f0..ae325990 100644
--- a/sre/roles/recorders/tasks/uninstall.yaml
+++ b/sre/roles/recorders/tasks/uninstall.yaml
@@ -5,6 +5,12 @@
   when:
     - recorders_enabled.alerts is defined
 
+- name: Import topology recorder uninstallation tasks
+  ansible.builtin.import_tasks:
+    file: uninstall_topology_recorders.yaml
+  when:
+    - recorders_enabled.topology is defined
+
 - name: Delete the namespace
   kubernetes.core.k8s:
     kubeconfig: "{{ recorders_cluster.kubeconfig }}"
diff --git a/sre/roles/recorders/tasks/uninstall_topology_recorders.yaml b/sre/roles/recorders/tasks/uninstall_topology_recorders.yaml
new file mode 100644
index 00000000..1b3ccf60
--- /dev/null
+++ b/sre/roles/recorders/tasks/uninstall_topology_recorders.yaml
@@ -0,0 +1,6 @@
+---
+- name: Import Kubernetes Topology Monitor uninstallation tasks
+  ansible.builtin.import_tasks:
+    file: uninstall_topology_recorders_kubernetes.yaml
+  when:
+    - recorders_enabled.topology.kubernetes | default(true)
diff --git a/sre/roles/recorders/tasks/uninstall_topology_recorders_kubernetes.yaml b/sre/roles/recorders/tasks/uninstall_topology_recorders_kubernetes.yaml
new file mode 100644
index 00000000..fa9662e2
--- /dev/null
+++ b/sre/roles/recorders/tasks/uninstall_topology_recorders_kubernetes.yaml
@@ -0,0 +1,43 @@
+---
+- name: Retrieve the topology recorder pod name
+  kubernetes.core.k8s_info:
+    api_version: v1
+    kind: Pod
+    kubeconfig: "{{ recorders_cluster.kubeconfig }}"
+    namespace: "{{ recorders_namespace.name }}"
+    label_selectors:
+      - app.kubernetes.io/name = kubernetes-topology-recorder
+      - app.kubernetes.io/part-of = it-bench
+  register: recorders_pods_info
+
+- name: Copy records directory from pod
+  kubernetes.core.k8s_cp:
+    kubeconfig: "{{ recorders_cluster.kubeconfig }}"
+    local_path: /tmp/topology
+    namespace: "{{ recorders_pods_info.resources[0].metadata.namespace }}"
+    pod: "{{ recorders_pods_info.resources[0].metadata.name }}"
+    remote_path: /opt/app-root/src/records
+    state: from_pod
+  when:
+    - recorders_pods_info is defined
+    - recorders_pods_info.resources | length == 1
+
+- name: Uninstall Kubernetes Topology Monitor Recorder
+  kubernetes.core.k8s:
+    kubeconfig: "{{ recorders_cluster.kubeconfig }}"
+    namespace: "{{ recorders_namespace.name }}"
+    src: files/kubernetes/topology/kubernetes.yaml
+    state: absent
+    wait: true
+
+- name: Delete ConfigMap with Python script
+  kubernetes.core.k8s:
+    kubeconfig: "{{ recorders_cluster.kubeconfig }}"
+    resource_definition:
+      apiVersion: v1
+      kind: ConfigMap
+      metadata:
+        name: topology-recorder-kubernetes-scripts
+        namespace: "{{ recorders_namespace.name }}"
+    state: absent
+    wait: true
diff --git a/sre/roles/tools/tasks/set_kubernetes_topology_monitor_endpoint.yaml b/sre/roles/tools/tasks/set_kubernetes_topology_monitor_endpoint.yaml
new file mode 100644
index 00000000..77a0782c
--- /dev/null
+++ b/sre/roles/tools/tasks/set_kubernetes_topology_monitor_endpoint.yaml
@@ -0,0 +1,19 @@
+---
+- name: Retrieve Kubernetes Topology Monitor service info
+  kubernetes.core.k8s_info:
+    api_version: v1
+    kind: Service
+    kubeconfig: "{{ tools_cluster.kubeconfig }}"
+    namespace: "{{ tools_helm_releases.kubernetes_topology_monitor.namespace }}"
+    label_selectors:
+      - app = topology-monitor
+  register: tools_topology_service_info
+
+- name: Extract Kubernetes Topology Monitor hostname
+  ansible.builtin.set_fact:
+    tools_kubernetes_topology_mapper_endpoint: http://{{ tools_topology_service_info.resources[0].metadata.name }}.{{
+      tools_topology_service_info.resources[0].metadata.namespace }}.svc.cluster.local:8080
+  when:
+    - tools_cluster.platform == "kubernetes"
+    - tools_topology_service_info is defined
+    - tools_topology_service_info.resources | length == 1
From f1d428c3f9a80926979c0b038746972bfa52f212 Mon Sep 17 00:00:00 2001
From: Gerard Vanloo
Date: Wed, 13 Aug 2025 19:40:34 -0400
Subject: [PATCH 04/35] feat: replace AWX traces cronjob with recorders roles

Signed-off-by: Gerard Vanloo
---
 .../sre-integration-smoke-tests.yaml | 111 ++++++++++++++++++
 sre/Makefile | 26 +++-
 sre/playbooks/manage_incidents.yaml | 8 --
 sre/playbooks/manage_recorders.yaml | 69 +++++++++++
 .../applications/tasks/install_otel_demo.yaml | 10 +-
sre/roles/awx/tasks/configure_jobs.yaml | 47 ++++++-- sre/roles/awx/tasks/configure_workflows.yaml | 22 +++- .../files/kubernetes/alerts/prometheus.yaml | 2 +- .../files/kubernetes/topology/kubernetes.yaml | 2 +- .../files/kubernetes/traces/jaeger.yaml | 71 +++++++++++ .../files/scripts/traces/jaeger/gather.py | 107 +++++++++++++++++ .../scripts/traces/jaeger/requirements.txt | 1 + sre/roles/recorders/tasks/install.yaml | 8 +- .../install_alerts_recorders_prometheus.yaml | 2 +- ...install_topology_recorders_kubernetes.yaml | 4 +- .../tasks/install_traces_recorders.yaml | 6 + .../install_traces_recorders_jaeger.yaml | 49 ++++++++ sre/roles/recorders/tasks/uninstall.yaml | 8 +- .../tasks/uninstall_traces_recorders.yaml | 6 + .../uninstall_traces_recorders_jaeger.yaml | 43 +++++++ .../tools/tasks/set_jaeger_endpoint.yaml | 20 ++++ ..._kubernetes_topology_monitor_endpoint.yaml | 7 +- .../tools/tasks/set_prometheus_endpoint.yaml | 3 +- 23 files changed, 587 insertions(+), 45 deletions(-) create mode 100644 sre/playbooks/manage_recorders.yaml create mode 100644 sre/roles/recorders/files/kubernetes/traces/jaeger.yaml create mode 100644 sre/roles/recorders/files/scripts/traces/jaeger/gather.py create mode 100644 sre/roles/recorders/files/scripts/traces/jaeger/requirements.txt create mode 100644 sre/roles/recorders/tasks/install_traces_recorders.yaml create mode 100644 sre/roles/recorders/tasks/install_traces_recorders_jaeger.yaml create mode 100644 sre/roles/recorders/tasks/uninstall_traces_recorders.yaml create mode 100644 sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml create mode 100644 sre/roles/tools/tasks/set_jaeger_endpoint.yaml diff --git a/.github/workflows/sre-integration-smoke-tests.yaml b/.github/workflows/sre-integration-smoke-tests.yaml index dc4f09f5..c4797e4d 100644 --- a/.github/workflows/sre-integration-smoke-tests.yaml +++ b/.github/workflows/sre-integration-smoke-tests.yaml @@ -90,6 +90,117 @@ jobs: - name: Run uninstallation smoke test run: | make -C sre undeploy_tools + data-recorder-jaeger: + name: Data Recorder (Jaeger) Smoke Tests + needs: + - jaeger + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v4.2.2 + - uses: actions/setup-python@v5.6.0 + with: + python-version: '3.12' + - uses: actions/setup-go@v5.5.0 + with: + go-version-file: sre/dev/local_cluster/go.mod + cache-dependency-path: sre/dev/local_cluster/go.sum + - uses: azure/setup-helm@v4.3.0 + with: + version: v3.18.3 + - name: Install Python and Ansible dependencies + run: | + pip install -r sre/requirements.txt + ansible-galaxy install -r sre/requirements.yaml + - name: Create Kind cluster + run: | + make -C sre/dev/local_cluster create_cluster + - name: Create group vars + run: | + make -C sre group_vars + echo "tools: { jaeger: true }" > sre/group_vars/environment/tools.yaml + - name: Install tools + run: | + make -C sre deploy_tools + - name: Run installation smoke test + run: | + make -C sre deploy_recorders + - name: Run uninstallation smoke test + run: | + make -C sre undeploy_recorders + data-recorder-kubernetes-topology-monitor: + name: Data Recorder (Kubernetes Topology Monitor) Smoke Tests + needs: + - kubernetes-topology-monitor + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v4.2.2 + - uses: actions/setup-python@v5.6.0 + with: + python-version: '3.12' + - uses: actions/setup-go@v5.5.0 + with: + go-version-file: sre/dev/local_cluster/go.mod + cache-dependency-path: sre/dev/local_cluster/go.sum + - uses: azure/setup-helm@v4.3.0 + with: + version: v3.18.3 + - name: 
Install Python and Ansible dependencies + run: | + pip install -r sre/requirements.txt + ansible-galaxy install -r sre/requirements.yaml + - name: Create Kind cluster + run: | + make -C sre/dev/local_cluster create_cluster + - name: Create group vars + run: | + make -C sre group_vars + echo "tools: { kubernetes_topology_monitor: true }" > sre/group_vars/environment/tools.yaml + - name: Install tools + run: | + make -C sre deploy_tools + - name: Run installation smoke test + run: | + make -C sre deploy_recorders + - name: Run uninstallation smoke test + run: | + make -C sre undeploy_recorders + data-recorder-prometheus: + name: Data Recorder (Prometheus) Smoke Tests + needs: + - prometheus + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v4.2.2 + - uses: actions/setup-python@v5.6.0 + with: + python-version: '3.12' + - uses: actions/setup-go@v5.5.0 + with: + go-version-file: sre/dev/local_cluster/go.mod + cache-dependency-path: sre/dev/local_cluster/go.sum + - uses: azure/setup-helm@v4.3.0 + with: + version: v3.18.3 + - name: Install Python and Ansible dependencies + run: | + pip install -r sre/requirements.txt + ansible-galaxy install -r sre/requirements.yaml + - name: Create Kind cluster + run: | + make -C sre/dev/local_cluster create_cluster + - name: Create group vars + run: | + make -C sre group_vars + echo "tools: { prometheus: true }" > sre/group_vars/environment/tools.yaml + - name: Install tools + run: | + make -C sre deploy_tools + - name: Run installation smoke test + run: | + make -C sre deploy_recorders + - name: Run uninstallation smoke test + run: | + make -C sre undeploy_recorders ingress: name: Ingress Smoke Tests runs-on: ubuntu-24.04 diff --git a/sre/Makefile b/sre/Makefile index 2de7e7fb..668277ba 100644 --- a/sre/Makefile +++ b/sre/Makefile @@ -62,10 +62,28 @@ else ansible-playbook -i inventory.yaml playbooks/manage_applications.yaml --tags "uninstall_applications" endif +.PHONY: deploy_recorders +deploy_recorders: ## Deploys the data recorders to cluster +ifdef INCIDENT_NUMBER + ansible-playbook -i inventory.yaml playbooks/manage_recorders.yaml --tags "install_recorders" \ + --extra-vars "incident_id=$(INCIDENT_NUMBER)" +else + ansible-playbook -i inventory.yaml playbooks/manage_recorders.yaml --tags "install_recorders" +endif + +.PHONY: undeploy_recorders +undeploy_recorders: ## Undeploys the data recorders to cluster +ifdef INCIDENT_NUMBER + ansible-playbook -i inventory.yaml playbooks/manage_recorders.yaml --tags "uninstall_recorders" \ + --extra-vars "incident_id=$(INCIDENT_NUMBER)" +else + ansible-playbook -i inventory.yaml playbooks/manage_recorders.yaml --tags "uninstall_recorders" +endif + .PHONY: inject_incident_fault inject_incident_fault: ## Injects the fault used in a specific incident ifdef INCIDENT_NUMBER - ansible-playbook -i inventory.yaml playbooks/manage_incidents.yaml --tags "inject_faults,install_recorders" \ + ansible-playbook -i inventory.yaml playbooks/manage_incidents.yaml --tags "inject_faults" \ --extra-vars "incident_id=$(INCIDENT_NUMBER)" else @echo "Missing INCIDENT_NUMBER argument. Please run this command with this variable." 
@@ -74,7 +92,7 @@ endif .PHONY: remove_incident_fault remove_incident_fault: ## Removes the fault used in a specific incident ifdef INCIDENT_NUMBER - ansible-playbook -i inventory.yaml playbooks/manage_incidents.yaml --tags "remove_faults,uninstall_recorders" \ + ansible-playbook -i inventory.yaml playbooks/manage_incidents.yaml --tags "remove_faults" \ --extra-vars "incident_id=$(INCIDENT_NUMBER)" else @echo "Missing INCIDENT_NUMBER argument. Please run this command with this variable." @@ -87,10 +105,10 @@ create_environment: deploy_tools deploy_applications ## Deploys tools and applic destroy_environment: undeploy_applications undeploy_tools ## Undeploys tools and applications to cluster .PHONY: start_incident -start_incident: create_environment inject_incident_fault ## Starts an incident by deploying a stack, application, and fault for an incident +start_incident: create_environment inject_incident_fault deploy_recorders ## Starts an incident by deploying a stack, application, and fault for an incident .PHONY: stop_incident -stop_incident: remove_incident_fault destroy_environment ## Stops an incident by undeploying a stack, application, and fault for an incident +stop_incident: undeploy_recorders remove_incident_fault destroy_environment ## Stops an incident by undeploying a stack, application, and fault for an incident .PHONY: deploy_awx_stack deploy_awx_stack: ## Deploys AWX to a cluster diff --git a/sre/playbooks/manage_incidents.yaml b/sre/playbooks/manage_incidents.yaml index 950fde5b..44b97ca4 100644 --- a/sre/playbooks/manage_incidents.yaml +++ b/sre/playbooks/manage_incidents.yaml @@ -40,14 +40,6 @@ incidents_file: id: "{{ incident_id }}" tasks: - # - name: Pause for 600 seconds pre-fault removal for alert recording - # ansible.builtin.pause: - # seconds: 600 - # tags: - # - pre_fault_removal - # when: - # - incident.runner != 'local' - - name: Import recorders role ansible.builtin.import_role: name: recorders diff --git a/sre/playbooks/manage_recorders.yaml b/sre/playbooks/manage_recorders.yaml new file mode 100644 index 00000000..fddb2094 --- /dev/null +++ b/sre/playbooks/manage_recorders.yaml @@ -0,0 +1,69 @@ +--- +- name: Manage SRE and FinOps Incident Data Recorder Stack + hosts: + - environment + pre_tasks: + - name: Import system role + ansible.builtin.import_role: + name: system + tags: + - always + vars: + system_cluster: + kubeconfig: "{{ cluster.kubeconfig }}" + + - name: Import cluster role + ansible.builtin.import_role: + name: cluster + tags: + - always + vars: + cluster_files: + kubeconfig: "{{ cluster.kubeconfig }}" + cluster_tools_enabled: + oc: "{{ system_oc_exists }}" + + - name: Import variables for incident + ansible.builtin.import_role: + name: incidents + tasks_from: load + tags: + - always + vars: + incidents_file: + id: "{{ incident_id }}" + when: + - incident_id is defined + + - name: Create tools_enabled dictionary + ansible.builtin.set_fact: + tools_enabled: + chaos_mesh: "{{ tools.chaos_mesh | default(false) }}" + clickhouse: "{{ tools.clickhouse | default(false) }}" + ingress: "{{ tools.ingress | default(false) }}" + jaeger: "{{ tools.jaeger | default(false) }}" + kubernetes_metrics_server: "{{ tools.kubernetes_metrics_server | default(false) }}" + kubernetes_topology_monitor: "{{ tools.kubernetes_topology_monitor | default(false) }}" + opencost: "{{ tools.opencost | default(false) }}" + opensearch: "{{ tools.opensearch | default(false) }}" + opentelemetry: "{{ tools.opentelemetry | default(false) }}" + prometheus: "{{ tools.prometheus | 
default(false) }}" + tags: + - always + when: + - incident_id is undefined + tasks: + - name: Import recorders role + ansible.builtin.import_role: + name: recorders + vars: + recorders_cluster: + kubeconfig: "{{ cluster.kubeconfig }}" + platform: "{{ cluster_platform }}" + recorders_enabled: + alerts: + prometheus: "{{ tools_enabled.prometheus }}" + topology: + kubernetes: "{{ tools_enabled.kubernetes_topology_monitor }}" + traces: + jaeger: "{{ tools_enabled.jaeger }}" diff --git a/sre/roles/applications/tasks/install_otel_demo.yaml b/sre/roles/applications/tasks/install_otel_demo.yaml index 48deb9ef..4c3faa3b 100644 --- a/sre/roles/applications/tasks/install_otel_demo.yaml +++ b/sre/roles/applications/tasks/install_otel_demo.yaml @@ -50,6 +50,14 @@ tools_cluster: kubeconfig: "{{ applications_cluster.kubeconfig }}" +- name: Import tools role for variable setting tasks + ansible.builtin.import_role: + name: tools + tasks_from: set_jaeger_endpoint + vars: + tools_cluster: + kubeconfig: "{{ applications_cluster.kubeconfig }}" + - name: Install OpenTelemetry Demo (Astronomy Shop) kubernetes.core.helm: chart_ref: opentelemetry-demo @@ -286,7 +294,7 @@ http: endpoint: http://{{ helm_releases.opensearch.name }}-master.{{ helm_releases.opensearch.namespace }}.svc.cluster.local:9200 otlp: - endpoint: http://jaeger-collector.{{ helm_releases.collectors.namespace }}.svc.cluster.local:4317 + endpoint: "{{ tools_jaeger_collector_otlp_endpoint }}" otlphttp/prometheus: endpoint: http://{{ helm_releases.prometheus.name }}-kube-prometheus-prometheus.{{ helm_releases.prometheus.namespace }}.svc.cluster.local:9090 prometheus: diff --git a/sre/roles/awx/tasks/configure_jobs.yaml b/sre/roles/awx/tasks/configure_jobs.yaml index 4eb7eb5c..2582fd82 100644 --- a/sre/roles/awx/tasks/configure_jobs.yaml +++ b/sre/roles/awx/tasks/configure_jobs.yaml @@ -127,13 +127,27 @@ project: GitHub-ITBench state: present -# TODO: Complete the refactoring of the following code: -# -# Telemetry Access code will be added directly to the Applications Role -# Recording code will be added directory to the Applications Role -# Post fault injection and pre fault removal will move to a Job +- name: Add job template for installing data recorders + awx.awx.job_template: + controller_host: "{{ awx_controller_host }}" + controller_password: "{{ awx_controller_password }}" # pragma: allowlist secret + controller_username: admin + credentials: + - Cluster-{{ incident_index + 1 }}-Kubeconfig + - AWS + execution_environment: AWX-EE-Custom + extra_vars: + incident_id: "{{ incident }}" + inventory: IT-Bench-SRE + job_tags: inject_faults + job_type: run + name: "Incident {{ incident }}: Install Data Recorders" + organization: ITBench-Scenarios + playbook: sre/playbooks/manage_recorders.yaml + project: GitHub-ITBench + state: present -- name: Creating/removing job template to setup for telemetry access +- name: Add job template for uninstalling data recorders awx.awx.job_template: controller_host: "{{ awx_controller_host }}" controller_password: "{{ awx_controller_password }}" # pragma: allowlist secret @@ -143,16 +157,21 @@ - AWS execution_environment: AWX-EE-Custom extra_vars: - sample_application: otel_astronomy_shop - job_tags: telemetry_access + incident_id: "{{ incident }}" + inventory: IT-Bench-SRE + job_tags: remove_faults job_type: run - name: "Scenario-{{ incident }}--Setup-For-Telemetry-Access" + name: "Incident {{ incident }}: Uninstall Data Recorders" organization: ITBench-Scenarios - playbook: sre/base.yaml + playbook: 
sre/playbooks/manage_recorders.yaml project: GitHub-ITBench state: present -- name: Creating/removing job template for trace gathering +# TODO: Complete the refactoring of the following code: +# +# Telemetry Access code will be added directly to the Applications Role + +- name: Creating/removing job template to setup for telemetry access awx.awx.job_template: controller_host: "{{ awx_controller_host }}" controller_password: "{{ awx_controller_password }}" # pragma: allowlist secret @@ -161,11 +180,13 @@ - Cluster-{{ incident_index + 1 }}-Kubeconfig - AWS execution_environment: AWX-EE-Custom + extra_vars: + sample_application: otel_astronomy_shop job_tags: telemetry_access job_type: run - name: "Scenario-{{ incident }}--Trace-Gathering" + name: "Scenario-{{ incident }}--Setup-For-Telemetry-Access" organization: ITBench-Scenarios - playbook: sre/cron_jobs/trace_recorder.yaml + playbook: sre/base.yaml project: GitHub-ITBench state: present diff --git a/sre/roles/awx/tasks/configure_workflows.yaml b/sre/roles/awx/tasks/configure_workflows.yaml index d5789267..ec9a6d4a 100644 --- a/sre/roles/awx/tasks/configure_workflows.yaml +++ b/sre/roles/awx/tasks/configure_workflows.yaml @@ -42,22 +42,23 @@ type: job_template related: success_nodes: - - identifier: node-trace-gathering + - identifier: node-trigger-fault failure_nodes: - identifier: node-capture-failed-deployment - - identifier: node-trace-gathering + always_nodes: [] + - identifier: node-trigger-fault unified_job_template: - name: "Scenario-{{ incident }}--Trace-Gathering" + name: "Incident {{ incident }}: Inject Faults" type: job_template related: success_nodes: - - identifier: node-trigger-fault + - identifier: node-install-recorders failure_nodes: - identifier: node-capture-failed-deployment always_nodes: [] - - identifier: node-trigger-fault + - identifier: node-install-recorders unified_job_template: - name: "Incident {{ incident }}: Inject Faults" + name: "Incident {{ incident }}: Install Data Recorders" type: job_template related: success_nodes: @@ -92,6 +93,15 @@ name: "Stop Incident {{ incident }}" state: present workflow_nodes: + - identifier: node-uninstall-recorders + unified_job_template: + name: "Incident {{ incident }}: Uninstall Data Recorders" + type: job_template + related: + success_nodes: [] + failure_nodes: [] + always_nodes: + - identifier: node-remove-fault - identifier: node-remove-fault unified_job_template: name: "Incident {{ incident }}: Remove Faults" diff --git a/sre/roles/recorders/files/kubernetes/alerts/prometheus.yaml b/sre/roles/recorders/files/kubernetes/alerts/prometheus.yaml index b6f2ad8e..7613c1fd 100644 --- a/sre/roles/recorders/files/kubernetes/alerts/prometheus.yaml +++ b/sre/roles/recorders/files/kubernetes/alerts/prometheus.yaml @@ -68,4 +68,4 @@ spec: requests: storage: 1Gi persistentVolumeClaimRetentionPolicy: - whenDeleted: Delete + whenDeleted: Delete diff --git a/sre/roles/recorders/files/kubernetes/topology/kubernetes.yaml b/sre/roles/recorders/files/kubernetes/topology/kubernetes.yaml index 65f7cb63..0bd3fd05 100644 --- a/sre/roles/recorders/files/kubernetes/topology/kubernetes.yaml +++ b/sre/roles/recorders/files/kubernetes/topology/kubernetes.yaml @@ -68,4 +68,4 @@ spec: requests: storage: 1Gi persistentVolumeClaimRetentionPolicy: - whenDeleted: Delete + whenDeleted: Delete diff --git a/sre/roles/recorders/files/kubernetes/traces/jaeger.yaml b/sre/roles/recorders/files/kubernetes/traces/jaeger.yaml new file mode 100644 index 00000000..f92df936 --- /dev/null +++ 
b/sre/roles/recorders/files/kubernetes/traces/jaeger.yaml @@ -0,0 +1,71 @@ +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + labels: + app.kubernetes.io/name: jaeger-traces-recorder + app.kubernetes.io/part-of: it-bench + name: jaeger-traces-recorder +spec: + selector: + matchLabels: + app.kubernetes.io/name: jaeger-traces-recorder + app.kubernetes.io/part-of: it-bench + template: + metadata: + annotations: + openshift.io/required-scc: restricted-v2 + labels: + app.kubernetes.io/name: jaeger-traces-recorder + app.kubernetes.io/part-of: it-bench + spec: + containers: + - name: recorder + image: registry.access.redhat.com/ubi9/python-312:9.6-1754326132 + command: + - /bin/sh + args: + - -c + - "python3.12 -m pip install -r ~/deps/requirements.txt && python3.12 ~/scripts/gather.py" + resources: + requests: + cpu: 100m + memory: 125Mi + limits: + memory: 250Mi + volumeMounts: + - name: dependencies + mountPath: /opt/app-root/src/deps + readOnly: true + - name: scripts + mountPath: /opt/app-root/src/scripts + readOnly: true + - name: jaeger-trace-records + mountPath: /opt/app-root/src/records + securityContext: + fsGroup: 1001 + volumes: + - name: scripts + configMap: + name: traces-recorder-jaeger-scripts + items: + - key: script + path: gather.py + - name: dependencies + configMap: + name: traces-recorder-jaeger-scripts + items: + - key: deps + path: requirements.txt + replicas: 1 + volumeClaimTemplates: + - metadata: + name: jaeger-trace-records + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 1Gi + persistentVolumeClaimRetentionPolicy: + whenDeleted: Delete diff --git a/sre/roles/recorders/files/scripts/traces/jaeger/gather.py b/sre/roles/recorders/files/scripts/traces/jaeger/gather.py new file mode 100644 index 00000000..336005d1 --- /dev/null +++ b/sre/roles/recorders/files/scripts/traces/jaeger/gather.py @@ -0,0 +1,107 @@ +import datetime +import json +import logging +import os +import sys +import time + +from datetime import datetime, timedelta, timezone + +import requests + +from requests.adapters import HTTPAdapter +from urllib3.util import Retry + +# Logging +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) + +logger = logging.getLogger(__name__) + + +def main(): + endpoint = os.environ.get("JAEGER_ENDPOINT") + if endpoint is None: + sys.exit("error: JAEGER_ENDPOINT environment variable is not set") + + headers = { "Content-Type": "application/json" } + + retries = Retry(total=3, backoff_factor=0.1) + adapter = HTTPAdapter(max_retries=retries) + + session = requests.Session() + session.mount("http://", adapter) + session.mount("https://", adapter) + + while True: + end_time = int(time.time_ns() // 1000) + start_time = end_time - (60 * 1_000_000) + + next_datetime = datetime.now() + timedelta(seconds=60) + + response = session.get( + "{0}/api/services".format(endpoint), + headers=headers, + verify=True + ) + + if response.status_code != 200: + logger.warning("unable to query jaeger for services") + else: + content = response.json() + services = content.get("data", []) + + logger.info("retrieved {0} services from jaeger".format(len(services))) + + for service in services: + response = session.get( + "{0}/api/operations".format(endpoint), + headers=headers, + params={ + 'service': service, + }, + verify=True + ) + + if response.status_code != 200: + logger.warning("unable to query jaeger for operations for service ({0})".format(service)) + continue + + content = response.json() + operations = content.get("data", []) + + 
logger.info("retrieved {0} operations from jaeger".format(len(operations))) + + for operation in operations: + response = session.get( + "{0}/api/traces".format(endpoint), + headers=headers, + params={ + 'service': service, + "operation": operation.get("name", ""), + "start": start_time, + "end": end_time, + "limit": 1 + }, + verify=True + ) + + if response.status_code != 200: + logger.warning("unable to query jaeger for traces for service ({0})".format(operation.get("name", ""))) + continue + + content = response.json() + traces.extend(content.get("data", [])) + + utc_seconds = (datetime.now(timezone.utc) - datetime(1970, 1, 1, tzinfo=timezone.utc)).total_seconds() + file_path = os.path.join(os.path.expanduser("~"), "records", "{0}-traces.json".format(round(utc_seconds))) + + with open(file_path, "w") as f: + json.dump(traces, f, indent=4) + + sleep_interval = (next_datetime - datetime.now()).total_seconds() + if sleep_interval > 0: + logger.debug("sleep for {0} seconds".format(sleep_interval)) + time.sleep(sleep_interval) + +if __name__ == "__main__": + main() diff --git a/sre/roles/recorders/files/scripts/traces/jaeger/requirements.txt b/sre/roles/recorders/files/scripts/traces/jaeger/requirements.txt new file mode 100644 index 00000000..2c24336e --- /dev/null +++ b/sre/roles/recorders/files/scripts/traces/jaeger/requirements.txt @@ -0,0 +1 @@ +requests==2.31.0 diff --git a/sre/roles/recorders/tasks/install.yaml b/sre/roles/recorders/tasks/install.yaml index 52ffd8c6..9362ad68 100644 --- a/sre/roles/recorders/tasks/install.yaml +++ b/sre/roles/recorders/tasks/install.yaml @@ -11,7 +11,7 @@ - name: Import alert recorder installation tasks ansible.builtin.import_tasks: - file: install_alert_recorders.yaml + file: install_alerts_recorders.yaml when: - recorders_enabled.alerts is defined @@ -20,3 +20,9 @@ file: install_topology_recorders.yaml when: - recorders_enabled.topology is defined + +- name: Import traces recorder installation tasks + ansible.builtin.import_tasks: + file: install_traces_recorders.yaml + when: + - recorders_enabled.traces is defined diff --git a/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml b/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml index 8612ad79..69f6aba0 100644 --- a/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml +++ b/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml @@ -60,7 +60,7 @@ 'name': PROMETHEUS_TOKEN, 'valueFrom': { 'secretKeyRef': { - 'name': 'alerts-recorder-prometheus-token' + 'name': 'alerts-recorder-prometheus-token', 'key': 'token' } } diff --git a/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml b/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml index aa613b5a..9532de48 100644 --- a/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml +++ b/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml @@ -18,8 +18,8 @@ name: topology-recorder-kubernetes-scripts namespace: "{{ recorders_namespace.name }}" data: - deps: "{{ lookup('ansible.builtin.file', 'files/scripts/alerts/prometheus/requirements.txt') }}" - script: "{{ lookup('ansible.builtin.file', 'files/scripts/alerts/prometheus/gather.py') }}" + deps: "{{ lookup('ansible.builtin.file', 'files/scripts/topology/kubernetes/requirements.txt') }}" + script: "{{ lookup('ansible.builtin.file', 'files/scripts/topology/kubernetes/gather.py') }}" state: present - name: Install Kubernetes Topology Recorder diff --git 
a/sre/roles/recorders/tasks/install_traces_recorders.yaml b/sre/roles/recorders/tasks/install_traces_recorders.yaml new file mode 100644 index 00000000..74ea1384 --- /dev/null +++ b/sre/roles/recorders/tasks/install_traces_recorders.yaml @@ -0,0 +1,6 @@ +--- +- name: Import Jaeger Traces Recorder installation tasks + ansible.builtin.import_tasks: + file: install_traces_recorders_jaeger.yaml + when: + - recorders_enabled.traces.jaeger | default(true) diff --git a/sre/roles/recorders/tasks/install_traces_recorders_jaeger.yaml b/sre/roles/recorders/tasks/install_traces_recorders_jaeger.yaml new file mode 100644 index 00000000..a553b147 --- /dev/null +++ b/sre/roles/recorders/tasks/install_traces_recorders_jaeger.yaml @@ -0,0 +1,49 @@ +--- +- name: Import tools role for variable setting tasks + ansible.builtin.import_role: + name: tools + tasks_from: set_jaeger_endpoint + vars: + tools_cluster: + kubeconfig: "{{ recorders_cluster.kubeconfig }}" + platform: "{{ recorders_cluster.platform }}" + +- name: Create ConfigMap with Python script + kubernetes.core.k8s: + kubeconfig: "{{ recorders_cluster.kubeconfig }}" + resource_definition: + apiVersion: v1 + kind: ConfigMap + metadata: + name: traces-recorder-jaeger-scripts + namespace: "{{ recorders_namespace.name }}" + data: + deps: "{{ lookup('ansible.builtin.file', 'files/scripts/traces/jaeger/requirements.txt') }}" + script: "{{ lookup('ansible.builtin.file', 'files/scripts/traces/jaeger/gather.py') }}" + state: present + +- name: Install Jaeger Traces Recorder + kubernetes.core.k8s: + kubeconfig: "{{ recorders_cluster.kubeconfig }}" + namespace: "{{ recorders_namespace.name }}" + src: files/kubernetes/traces/jaeger.yaml + state: present + +- name: Update Jaeger Traces environment variables + kubernetes.core.k8s: + kubeconfig: "{{ recorders_cluster.kubeconfig }}" + resource_definition: + apiVersion: apps/v1 + kind: StatefulSet + metadata: + name: jaeger-traces-recorder + namespace: "{{ recorders_namespace.name }}" + spec: + template: + spec: + containers: + - name: recorder + env: + - name: JAEGER_ENDPOINT + value: "{{ tools_jaeger_querier_endpoint }}" + state: patched diff --git a/sre/roles/recorders/tasks/uninstall.yaml b/sre/roles/recorders/tasks/uninstall.yaml index ae325990..b4d1b814 100644 --- a/sre/roles/recorders/tasks/uninstall.yaml +++ b/sre/roles/recorders/tasks/uninstall.yaml @@ -1,7 +1,7 @@ --- - name: Import alert recorder uninstallation tasks ansible.builtin.import_tasks: - file: uninstall_alert_recorders.yaml + file: uninstall_alerts_recorders.yaml when: - recorders_enabled.alerts is defined @@ -11,6 +11,12 @@ when: - recorders_enabled.topology is defined +- name: Import traces recorder uninstallation tasks + ansible.builtin.import_tasks: + file: uninstall_traces_recorders.yaml + when: + - recorders_enabled.traces is defined + - name: Delete the namespace kubernetes.core.k8s: kubeconfig: "{{ recorders_cluster.kubeconfig }}" diff --git a/sre/roles/recorders/tasks/uninstall_traces_recorders.yaml b/sre/roles/recorders/tasks/uninstall_traces_recorders.yaml new file mode 100644 index 00000000..549855b5 --- /dev/null +++ b/sre/roles/recorders/tasks/uninstall_traces_recorders.yaml @@ -0,0 +1,6 @@ +--- +- name: Import Jaeger Traces Recorder uninstallation tasks + ansible.builtin.import_tasks: + file: uninstall_traces_recorders_jaeger.yaml + when: + - recorders_enabled.traces.jaeger | default(true) diff --git a/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml 
b/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml new file mode 100644 index 00000000..578105db --- /dev/null +++ b/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml @@ -0,0 +1,43 @@ +--- +- name: Retrieve the traces recorder pod name + kubernetes.core.k8s_info: + api_version: v1 + kind: Pod + kubeconfig: "{{ recorders_cluster.kubeconfig }}" + namespace: "{{ recorders_namespace.name }}" + label_selectors: + - app.kubernetes.io/name = jaeger-traces-recorder + - app.kubernetes.io/part-of = it-bench + register: recorders_pods_info + +- name: Copy records directory from pod + kubernetes.core.k8s_cp: + kubeconfig: "{{ recorders_cluster.kubeconfig }}" + local_path: /tmp/traces + namespace: "{{ recorders_pods_info.resources[0].metadata.namespace }}" + pod: "{{ recorders_pods_info.resources[0].metadata.name }}" + remote_path: /opt/app-root/src/records + state: from_pod + when: + - recorders_pods_info is defined + - recorders_pods_info.resources | length == 1 + +- name: Uninstall Jaeger Traces Recorder + kubernetes.core.k8s: + kubeconfig: "{{ recorders_cluster.kubeconfig }}" + namespace: "{{ recorders_namespace.name }}" + src: files/kubernetes/traces/jaeger.yaml + state: absent + wait: true + +- name: Delete ConfigMap with Python script + kubernetes.core.k8s: + kubeconfig: "{{ recorders_cluster.kubeconfig }}" + resource_definition: + apiVersion: v1 + kind: ConfigMap + metadata: + name: traces-recorder-jaeger-scripts + namespace: "{{ recorders_namespace.name }}" + state: absent + wait: true diff --git a/sre/roles/tools/tasks/set_jaeger_endpoint.yaml b/sre/roles/tools/tasks/set_jaeger_endpoint.yaml new file mode 100644 index 00000000..089f32bc --- /dev/null +++ b/sre/roles/tools/tasks/set_jaeger_endpoint.yaml @@ -0,0 +1,20 @@ +--- +- name: Retrieve Jaeger service info + kubernetes.core.k8s_info: + api_version: v1 + kind: Service + kubeconfig: "{{ tools_cluster.kubeconfig }}" + name: jaeger-collector + namespace: "{{ tools_helm_releases.opentelemetry_collectors.namespace }}" + register: tools_jaeger_service_info + +- name: Extract Prometheus hostname + ansible.builtin.set_fact: + tools_jaeger_collector_otlp_endpoint: http://{{ tools_jaeger_service_info.resources[0].metadata.name }}.{{ + tools_jaeger_service_info.resources[0].metadata.namespace }}.svc.cluster.local:4317 + tools_jaeger_querier_endpoint: http://{{ tools_jaeger_service_info.resources[0].metadata.name }}.{{ + tools_jaeger_service_info.resources[0].metadata.namespace }}.svc.cluster.local:16686 + when: + - tools_cluster.platform == "kubernetes" + - tools_jaeger_service_info is defined + - tools_jaeger_service_info.resources | length == 1 diff --git a/sre/roles/tools/tasks/set_kubernetes_topology_monitor_endpoint.yaml b/sre/roles/tools/tasks/set_kubernetes_topology_monitor_endpoint.yaml index 77a0782c..7100b93a 100644 --- a/sre/roles/tools/tasks/set_kubernetes_topology_monitor_endpoint.yaml +++ b/sre/roles/tools/tasks/set_kubernetes_topology_monitor_endpoint.yaml @@ -1,18 +1,17 @@ --- -- name: Retrieve Prometheus service info +- name: Retrieve Kubernetes Topology Monitor service info kubernetes.core.k8s_info: api_version: v1 kind: Service kubeconfig: "{{ tools_cluster.kubeconfig }}" + name: topology-monitor namespace: "{{ tools_helm_releases.kubernetes_topology_monitor.namespace }}" - label_selectors: - - app = topology-monitor register: tools_topology_service_info - name: Extract Prometheus hostname ansible.builtin.set_fact: tools_kubernetes_topology_mapper_endpoint: http://{{
tools_topology_service_info.resources[0].metadata.name }}.{{ - tools_topology_service_inforesources[0].metadata.namespace }}.svc.cluster.local:8080 + tools_topology_service_info.resources[0].metadata.namespace }}.svc.cluster.local:8080 when: - tools_cluster.platform == "kubernetes" - tools_topology_service_info is defined diff --git a/sre/roles/tools/tasks/set_prometheus_endpoint.yaml b/sre/roles/tools/tasks/set_prometheus_endpoint.yaml index 6badc882..3404a43f 100644 --- a/sre/roles/tools/tasks/set_prometheus_endpoint.yaml +++ b/sre/roles/tools/tasks/set_prometheus_endpoint.yaml @@ -36,9 +36,8 @@ api_version: v1 kind: Service kubeconfig: "{{ tools_cluster.kubeconfig }}" + name: "{{ tools_helm_releases.prometheus.name }}-kube-prometheus-prometheus" namespace: "{{ tools_helm_releases.prometheus.namespace }}" - label_selectors: - - app = kube-prometheus-stack-prometheus register: tools_prometheus_service_info when: - tools_cluster.platform == "kubernetes" From 13d1581640cf426d17f6aa467847d8ac95bf1a98 Mon Sep 17 00:00:00 2001 From: Gerard Vanloo Date: Wed, 13 Aug 2025 20:25:08 -0400 Subject: [PATCH 05/35] chore: cleaning files Signed-off-by: Gerard Vanloo --- .../files/scripts/traces/jaeger/gather.py | 137 ++++++++++-------- .../uninstall_traces_recorders_jaeger.yaml | 22 +-- .../tools/tasks/set_clickhouse_endpoint.yaml | 3 +- .../tools/tasks/set_jaeger_endpoint.yaml | 3 +- ..._kubernetes_topology_monitor_endpoint.yaml | 3 +- .../tools/tasks/set_prometheus_endpoint.yaml | 2 +- 6 files changed, 93 insertions(+), 77 deletions(-) diff --git a/sre/roles/recorders/files/scripts/traces/jaeger/gather.py b/sre/roles/recorders/files/scripts/traces/jaeger/gather.py index 336005d1..5aaf9323 100644 --- a/sre/roles/recorders/files/scripts/traces/jaeger/gather.py +++ b/sre/roles/recorders/files/scripts/traces/jaeger/gather.py @@ -17,6 +17,62 @@ logger = logging.getLogger(__name__) +def get_services(session, endpoint, headers): + response = session.get( + "{0}/api/services".format(endpoint), + headers=headers, + verify=True + ) + + if response.status_code != 200: + logger.warning("unable to query jaeger for services") + return [] + + content = response.json() + return content.get("data", []) + +def get_operations(session, endpoint, headers, service): + response = session.get( + "{0}/api/operations".format(endpoint), + headers=headers, + params={ + 'service': service, + }, + verify=True + ) + + if response.status_code != 200: + logger.warning("unable to query jaeger for operations related to {0}".format(service)) + return [] + + content = response.json() + return content.get("data", []) + +def get_traces(session, endpoint, headers, service, operation, time_window): + name = operation.get("name") + if name is None: + logger.warning("unable to discover name of operation: {0}".format(operation)) + return [] + + response = session.get( + "{0}/api/traces".format(endpoint), + headers=headers, + params={ + 'service': service, + "operation": name, + "start": time_window[0], + "end": time_window[1], + "limit": 1 + }, + verify=True + ) + + if response.status_code != 200: + logger.warning("unable to query jaeger for traces related to operation ({0})".format(name)) + return [] + + content = response.json() + return content.get("data", []) def main(): endpoint = os.environ.get("JAEGER_ENDPOINT") @@ -33,70 +89,31 @@ def main(): session.mount("https://", adapter) while True: + next_datetime = datetime.now() + timedelta(seconds=60) + + traces = [] + end_time = int(time.time_ns() // 1000) start_time = end_time - (60 * 
1_000_000) - next_datetime = datetime.now() + timedelta(seconds=60) + services = get_services(session, endpoint, headers) + logger.info("retrieved {0} services from jaeger".format(len(services))) + + for service in services: + operations = get_operations(session, endpoint, headers, service) + logger.info("retrieved {0} operations from jaeger".format(len(operations))) + + for operation in operations: + t = get_traces(session, endpoint, headers, service, operation, (start_time, end_time)) + logger.info("retrieved {0} traces from jaeger".format(len(ts))) + + traces.extend(t) + + utc_seconds = (datetime.now(timezone.utc) - datetime(1970, 1, 1, tzinfo=timezone.utc)).total_seconds() + file_path = os.path.join(os.path.expanduser("~"), "records", "{0}-traces.json".format(round(utc_seconds))) - response = session.get( - "{0}/api/services".format(endpoint), - headers=headers, - verify=True - ) - - if response.status_code != 200: - logger.warning("unable to query jaeger for services") - else: - content = response.json() - services = content.get("data", []) - - logger.info("retrieved {0} services from jaeger".format(len(services))) - - for service in services: - response = session.get( - "{0}/api/operations".format(endpoint), - headers=headers, - params={ - 'service': service, - }, - verify=True - ) - - if response.status_code != 200: - logger.warning("unable to query jaeger for operations for service ({0})".format(service)) - continue - - content = response.json() - operations = content.get("data", []) - - logger.info("retrieved {0} operations from jaeger".format(len(operations))) - - for operation in operations: - response = session.get( - "{0}/api/traces".format(endpoint), - headers=headers, - params={ - 'service': service, - "operation": operation.get("name", ""), - "start": start_time, - "end": end_time, - "limit": 1 - }, - verify=True - ) - - if response.status_code != 200: - logger.warning("unable to query jaeger for traces for service ({0})".format(operation.get("name", ""))) - continue - - content = response.json() - traces.extend(content.get("data", [])) - - utc_seconds = (datetime.now(timezone.utc) - datetime(1970, 1, 1, tzinfo=timezone.utc)).total_seconds() - file_path = os.path.join(os.path.expanduser("~"), "records", "{0}-traces.json".format(round(utc_seconds))) - - with open(file_path, "w") as f: - json.dump(traces, f, indent=4) + with open(file_path, "w") as f: + json.dump(traces, f, indent=4) sleep_interval = (next_datetime - datetime.now()).total_seconds() if sleep_interval > 0: diff --git a/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml b/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml index 578105db..176be4a8 100644 --- a/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml +++ b/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml @@ -10,17 +10,17 @@ - app.kubernetes.io/part-of = it-bench register: recorders_pods_info -- name: Copy records directory from pod - kubernetes.core.k8s_cp: - kubeconfig: "{{ recorders_cluster.kubeconfig }}" - local_path: /tmp/traces - namespace: "{{ recorders_pods_info.resources[0].metadata.namespace }}" - pod: "{{ recorders_pods_info.resources[0].metadata.name }}" - remote_path: /opt/app-root/src/records - state: from_pod - when: - - recorders_pods_info is defined - - recorders_pods_info.resources | length == 1 +# - name: Copy records directory from pod +# kubernetes.core.k8s_cp: +# kubeconfig: "{{ recorders_cluster.kubeconfig }}" +# local_path: /tmp/traces +# namespace: "{{ 
recorders_pods_info.resources[0].metadata.namespace }}" +# pod: "{{ recorders_pods_info.resources[0].metadata.name }}" +# remote_path: /opt/app-root/src/records +# state: from_pod +# when: +# - recorders_pods_info is defined +# - recorders_pods_info.resources | length == 1 - name: Uninstall Jaeger Traces Recorder kubernetes.core.k8s: diff --git a/sre/roles/tools/tasks/set_clickhouse_endpoint.yaml b/sre/roles/tools/tasks/set_clickhouse_endpoint.yaml index 3e43846c..469ba9af 100644 --- a/sre/roles/tools/tasks/set_clickhouse_endpoint.yaml +++ b/sre/roles/tools/tasks/set_clickhouse_endpoint.yaml @@ -9,8 +9,9 @@ wait: true register: tools_clickhouse_installation_info -- name: Extract endpoint for Clickhouse cluster +- name: Extract in-cluster endpoint for Clickhouse cluster ansible.builtin.set_fact: tools_clickhouse_endpoint: http://{{ tools_clickhouse_installation_info.resources[0].status.endpoint }}:8123 when: + - tools_clickhouse_installation_info is defined - tools_clickhouse_installation_info.resources | length > 0 diff --git a/sre/roles/tools/tasks/set_jaeger_endpoint.yaml b/sre/roles/tools/tasks/set_jaeger_endpoint.yaml index 089f32bc..b88b97db 100644 --- a/sre/roles/tools/tasks/set_jaeger_endpoint.yaml +++ b/sre/roles/tools/tasks/set_jaeger_endpoint.yaml @@ -8,13 +8,12 @@ namespace: "{{ tools_helm_releases.opentelemetry_collectors.namespace }}" register: tools_jaeger_service_info -- name: Extract Prometheus hostname +- name: Construct Jaeger in-cluster endpoints ansible.builtin.set_fact: tools_jaeger_collector_otlp_endpoint: http://{{ tools_jaeger_service_info.resources[0].metadata.name }}.{{ tools_jaeger_service_info.resources[0].metadata.namespace }}.svc.cluster.local:4317 tools_jaeger_querier_endpoint: http://{{ tools_jaeger_service_info.resources[0].metadata.name }}.{{ tools_jaeger_service_info.resources[0].metadata.namespace }}.svc.cluster.local:16686 when: - - tools_cluster.platform == "kubernetes" - tools_jaeger_service_info is defined - tools_jaeger_service_info.resources | length == 1 diff --git a/sre/roles/tools/tasks/set_kubernetes_topology_monitor_endpoint.yaml b/sre/roles/tools/tasks/set_kubernetes_topology_monitor_endpoint.yaml index 7100b93a..038f3c49 100644 --- a/sre/roles/tools/tasks/set_kubernetes_topology_monitor_endpoint.yaml +++ b/sre/roles/tools/tasks/set_kubernetes_topology_monitor_endpoint.yaml @@ -8,11 +8,10 @@ namespace: "{{ tools_helm_releases.kubernetes_topology_monitor.namespace }}" register: tools_topology_service_info -- name: Extract Prometheus hostname +- name: Construct Kubernetes Topology in-cluster endpoint ansible.builtin.set_fact: tools_kubernetes_topology_mapper_endpoint: http://{{ tools_topology_service_info.resources[0].metadata.name }}.{{ tools_topology_service_info.resources[0].metadata.namespace }}.svc.cluster.local:8080 when: - - tools_cluster.platform == "kubernetes" - tools_topology_service_info is defined - tools_topology_service_info.resources | length == 1 diff --git a/sre/roles/tools/tasks/set_prometheus_endpoint.yaml b/sre/roles/tools/tasks/set_prometheus_endpoint.yaml index 3404a43f..a04b66bb 100644 --- a/sre/roles/tools/tasks/set_prometheus_endpoint.yaml +++ b/sre/roles/tools/tasks/set_prometheus_endpoint.yaml @@ -42,7 +42,7 @@ when: - tools_cluster.platform == "kubernetes" -- name: Extract Prometheus hostname +- name: Construct Prometheus in-cluster endpoint ansible.builtin.set_fact: tools_prometheus_endpoint: http://{{ tools_prometheus_service_info.resources[0].metadata.name }}.{{ 
tools_prometheus_service_info.resources[0].metadata.namespace }}.svc.cluster.local:9090 From ac7e6273d1d644a82523106be7346b3abc3e4bfb Mon Sep 17 00:00:00 2001 From: Gerard Vanloo Date: Wed, 13 Aug 2025 21:15:50 -0400 Subject: [PATCH 06/35] fix: wait for statefulset to successfully update Signed-off-by: Gerard Vanloo --- .../files/kubernetes/topology/kubernetes.yaml | 2 +- .../scripts/topology/kubernetes/gather.py | 2 +- .../files/scripts/traces/jaeger/gather.py | 13 ++++++------ .../install_alerts_recorders_prometheus.yaml | 21 +++++++++++++++++++ ...install_topology_recorders_kubernetes.yaml | 18 ++++++++++++++++ .../install_traces_recorders_jaeger.yaml | 18 ++++++++++++++++ 6 files changed, 66 insertions(+), 8 deletions(-) diff --git a/sre/roles/recorders/files/kubernetes/topology/kubernetes.yaml b/sre/roles/recorders/files/kubernetes/topology/kubernetes.yaml index 0bd3fd05..ee11a116 100644 --- a/sre/roles/recorders/files/kubernetes/topology/kubernetes.yaml +++ b/sre/roles/recorders/files/kubernetes/topology/kubernetes.yaml @@ -66,6 +66,6 @@ spec: - ReadWriteOnce resources: requests: - storage: 1Gi + storage: 10Gi persistentVolumeClaimRetentionPolicy: whenDeleted: Delete diff --git a/sre/roles/recorders/files/scripts/topology/kubernetes/gather.py b/sre/roles/recorders/files/scripts/topology/kubernetes/gather.py index 7d70a347..d1d5adca 100644 --- a/sre/roles/recorders/files/scripts/topology/kubernetes/gather.py +++ b/sre/roles/recorders/files/scripts/topology/kubernetes/gather.py @@ -33,7 +33,7 @@ def main(): session.mount("https://", adapter) while True: - next_datetime = datetime.now() + timedelta(seconds=60) + next_datetime = datetime.now() + timedelta(seconds=180) for item in ["nodes", "edges", "graph", "events"]: response = session.get("{0}/{1}".format(endpoint, item), headers=headers, verify=True) diff --git a/sre/roles/recorders/files/scripts/traces/jaeger/gather.py b/sre/roles/recorders/files/scripts/traces/jaeger/gather.py index 5aaf9323..a00ab2cb 100644 --- a/sre/roles/recorders/files/scripts/traces/jaeger/gather.py +++ b/sre/roles/recorders/files/scripts/traces/jaeger/gather.py @@ -89,12 +89,12 @@ def main(): session.mount("https://", adapter) while True: - next_datetime = datetime.now() + timedelta(seconds=60) + next_datetime = datetime.now() + timedelta(seconds=300) traces = [] end_time = int(time.time_ns() // 1000) - start_time = end_time - (60 * 1_000_000) + start_time = end_time - (300 * 1_000_000) services = get_services(session, endpoint, headers) logger.info("retrieved {0} services from jaeger".format(len(services))) @@ -109,11 +109,12 @@ def main(): traces.extend(t) - utc_seconds = (datetime.now(timezone.utc) - datetime(1970, 1, 1, tzinfo=timezone.utc)).total_seconds() - file_path = os.path.join(os.path.expanduser("~"), "records", "{0}-traces.json".format(round(utc_seconds))) + if len(traces) > 0: + utc_seconds = (datetime.now(timezone.utc) - datetime(1970, 1, 1, tzinfo=timezone.utc)).total_seconds() + file_path = os.path.join(os.path.expanduser("~"), "records", "{0}-traces.json".format(round(utc_seconds))) - with open(file_path, "w") as f: - json.dump(traces, f, indent=4) + with open(file_path, "w") as f: + json.dump(traces, f, indent=4) sleep_interval = (next_datetime - datetime.now()).total_seconds() if sleep_interval > 0: diff --git a/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml b/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml index 69f6aba0..70ee67b6 100644 --- 
a/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml +++ b/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml @@ -50,6 +50,8 @@ recorders_prometheus_env_vars: - name: PROMETHEUS_ENDPOINT value: "{{ tools_prometheus_endpoint }}" + when: + - tools_prometheus_endpoint is defined - name: Add Secret to environment list ansible.builtin.set_fact: @@ -69,6 +71,7 @@ when: - recorders_cluster.platform == "openshift" - tools_prometheus_bearer_token is defined + - recorders_prometheus_env_vars is defined - name: Update Prometheus Alert Recorder environment variables kubernetes.core.k8s: @@ -86,3 +89,21 @@ - name: recorder env: "{{ recorders_prometheus_env_vars }}" state: patched + when: + - recorders_prometheus_env_vars is defined + +- name: Wait for workload to update + kubernetes.core.k8s_info: + api_version: apps/v1 + kind: StatefulSet + kubeconfig: "{{ recorders_cluster.kubeconfig }}" + name: prometheus-alert-recorder + namespace: "{{ recorders_namespace.name }}" + register: recorders_statefulset_info + until: + - recorders_statefulset_info.resources | length > 0 + - recorders_statefulset_info.resources[0].status is defined + - recorders_statefulset_info.resources[0].status.readyReplicas is defined + - recorders_statefulset_info.resources[0].status.readyReplicas == 1 + retries: 8 + delay: 15 diff --git a/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml b/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml index 9532de48..954eef72 100644 --- a/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml +++ b/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml @@ -47,3 +47,21 @@ - name: KUBERNETES_TOPOLOGY_MONITOR_ENDPOINT value: "{{ tools_kubernetes_topology_mapper_endpoint }}" state: patched + when: + - tools_kubernetes_topology_mapper_endpoint is defined + +- name: Wait for workload to update + kubernetes.core.k8s_info: + api_version: apps/v1 + kind: StatefulSet + kubeconfig: "{{ recorders_cluster.kubeconfig }}" + name: kubernetes-topology-recorder + namespace: "{{ recorders_namespace.name }}" + register: recorders_statefulset_info + until: + - recorders_statefulset_info.resources | length > 0 + - recorders_statefulset_info.resources[0].status is defined + - recorders_statefulset_info.resources[0].status.readyReplicas is defined + - recorders_statefulset_info.resources[0].status.readyReplicas == 1 + retries: 8 + delay: 15 diff --git a/sre/roles/recorders/tasks/install_traces_recorders_jaeger.yaml b/sre/roles/recorders/tasks/install_traces_recorders_jaeger.yaml index a553b147..f8592e17 100644 --- a/sre/roles/recorders/tasks/install_traces_recorders_jaeger.yaml +++ b/sre/roles/recorders/tasks/install_traces_recorders_jaeger.yaml @@ -47,3 +47,21 @@ - name: JAEGER_ENDPOINT value: "{{ tools_jaeger_querier_endpoint }}" state: patched + when: + - tools_jaeger_querier_endpoint is defined + +- name: Wait for workload to update + kubernetes.core.k8s_info: + api_version: apps/v1 + kind: StatefulSet + kubeconfig: "{{ recorders_cluster.kubeconfig }}" + name: jaeger-traces-recorder + namespace: "{{ recorders_namespace.name }}" + register: recorders_statefulset_info + until: + - recorders_statefulset_info.resources | length > 0 + - recorders_statefulset_info.resources[0].status is defined + - recorders_statefulset_info.resources[0].status.readyReplicas is defined + - recorders_statefulset_info.resources[0].status.readyReplicas == 1 + retries: 8 + delay: 15 From 435bc97719d1c7ff2d57ce6608964163e082a29b Mon Sep 17 
00:00:00 2001 From: Gerard Vanloo Date: Wed, 13 Aug 2025 21:36:01 -0400 Subject: [PATCH 07/35] chore: adjust retry values Signed-off-by: Gerard Vanloo --- .../scripts/topology/kubernetes/gather.py | 2 +- .../files/scripts/traces/jaeger/gather.py | 9 +++++--- .../uninstall_traces_recorders_jaeger.yaml | 22 +++++++++---------- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/sre/roles/recorders/files/scripts/topology/kubernetes/gather.py b/sre/roles/recorders/files/scripts/topology/kubernetes/gather.py index d1d5adca..5d1ccfc8 100644 --- a/sre/roles/recorders/files/scripts/topology/kubernetes/gather.py +++ b/sre/roles/recorders/files/scripts/topology/kubernetes/gather.py @@ -25,7 +25,7 @@ def main(): headers = { "Content-Type": "application/json" } - retries = Retry(total=3, backoff_factor=0.1) + retries = Retry(total=3, backoff_factor=0.5) adapter = HTTPAdapter(max_retries=retries) session = requests.Session() diff --git a/sre/roles/recorders/files/scripts/traces/jaeger/gather.py b/sre/roles/recorders/files/scripts/traces/jaeger/gather.py index a00ab2cb..6fd49dfb 100644 --- a/sre/roles/recorders/files/scripts/traces/jaeger/gather.py +++ b/sre/roles/recorders/files/scripts/traces/jaeger/gather.py @@ -36,7 +36,7 @@ def get_operations(session, endpoint, headers, service): "{0}/api/operations".format(endpoint), headers=headers, params={ - 'service': service, + "service": service, }, verify=True ) @@ -58,7 +58,7 @@ def get_traces(session, endpoint, headers, service, operation, time_window): "{0}/api/traces".format(endpoint), headers=headers, params={ - 'service': service, + "service": service, "operation": name, "start": time_window[0], "end": time_window[1], @@ -81,13 +81,16 @@ def main(): headers = { "Content-Type": "application/json" } - retries = Retry(total=3, backoff_factor=0.1) + retries = Retry(total=3, backoff_factor=0.3) adapter = HTTPAdapter(max_retries=retries) session = requests.Session() session.mount("http://", adapter) session.mount("https://", adapter) + logger.info("sleeping for 5 minutes to wait for Jaeger to receive data") + time.sleep(300) + while True: next_datetime = datetime.now() + timedelta(seconds=300) diff --git a/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml b/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml index 176be4a8..578105db 100644 --- a/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml +++ b/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml @@ -10,17 +10,17 @@ - app.kubernetes.io/part-of = it-bench register: recorders_pods_info -# - name: Copy records directory from pod -# kubernetes.core.k8s_cp: -# kubeconfig: "{{ recorders_cluster.kubeconfig }}" -# local_path: /tmp/traces -# namespace: "{{ recorders_pods_info.resources[0].metadata.namespace }}" -# pod: "{{ recorders_pods_info.resources[0].metadata.name }}" -# remote_path: /opt/app-root/src/records -# state: from_pod -# when: -# - recorders_pods_info is defined -# - recorders_pods_info.resources | length == 1 +- name: Copy records directory from pod + kubernetes.core.k8s_cp: + kubeconfig: "{{ recorders_cluster.kubeconfig }}" + local_path: /tmp/traces + namespace: "{{ recorders_pods_info.resources[0].metadata.namespace }}" + pod: "{{ recorders_pods_info.resources[0].metadata.name }}" + remote_path: /opt/app-root/src/records + state: from_pod + when: + - recorders_pods_info is defined + - recorders_pods_info.resources | length == 1 - name: Uninstall Jaeger Traces Recorder kubernetes.core.k8s: From 
6b13073c91730940d78f6e4e4039926d9b178bb4 Mon Sep 17 00:00:00 2001 From: Gerard Vanloo Date: Thu, 14 Aug 2025 09:15:14 -0400 Subject: [PATCH 08/35] fix: correct conditionals Signed-off-by: Gerard Vanloo --- sre/roles/recorders/tasks/install_topology_recorders.yaml | 2 +- sre/roles/recorders/tasks/uninstall_topology_recorders.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sre/roles/recorders/tasks/install_topology_recorders.yaml b/sre/roles/recorders/tasks/install_topology_recorders.yaml index be1a4c3f..d7496a34 100644 --- a/sre/roles/recorders/tasks/install_topology_recorders.yaml +++ b/sre/roles/recorders/tasks/install_topology_recorders.yaml @@ -3,4 +3,4 @@ ansible.builtin.import_tasks: file: install_topology_recorders_kubernetes.yaml when: - - recorders_enabled.alerts.kubernetes | default(true) + - recorders_enabled.topology.kubernetes | default(true) diff --git a/sre/roles/recorders/tasks/uninstall_topology_recorders.yaml b/sre/roles/recorders/tasks/uninstall_topology_recorders.yaml index 1b3ccf60..6a79b336 100644 --- a/sre/roles/recorders/tasks/uninstall_topology_recorders.yaml +++ b/sre/roles/recorders/tasks/uninstall_topology_recorders.yaml @@ -3,4 +3,4 @@ ansible.builtin.import_tasks: file: uninstall_topology_recorders_kubernetes.yaml when: - - recorders_enabled.alerts.kubernetes | default(true) + - recorders_enabled.topology.kubernetes | default(true) From 174306c1e73ffcc703f964ba7527664390cd9633 Mon Sep 17 00:00:00 2001 From: Gerard Vanloo Date: Thu, 14 Aug 2025 10:46:09 -0400 Subject: [PATCH 09/35] fix: update incident smoke tests to focus on fault injection and removal Signed-off-by: Gerard Vanloo --- .../sre-integration-smoke-tests.yaml | 182 +++++++++++++----- 1 file changed, 130 insertions(+), 52 deletions(-) diff --git a/.github/workflows/sre-integration-smoke-tests.yaml b/.github/workflows/sre-integration-smoke-tests.yaml index c4797e4d..9676bb28 100644 --- a/.github/workflows/sre-integration-smoke-tests.yaml +++ b/.github/workflows/sre-integration-smoke-tests.yaml @@ -558,12 +558,18 @@ jobs: - name: Create group vars run: | make -C sre group_vars - - name: Test incident start + - name: Create environment run: | - INCIDENT_NUMBER=3 make -C sre start_incident - - name: Test incident stop + INCIDENT_NUMBER=3 make -C sre create_environment + - name: Test fault injection run: | - INCIDENT_NUMBER=3 make -C sre stop_incident + INCIDENT_NUMBER=3 make -C sre inject_incident_fault + - name: Test fault removal + run: | + INCIDENT_NUMBER=3 make -C sre remove_incident_fault + - name: Destroy environment + run: | + INCIDENT_NUMBER=3 make -C sre destroy_environment sre-incident-16: name: SRE Incident 16 Smoke Test needs: @@ -593,12 +599,18 @@ jobs: - name: Create group vars run: | make -C sre group_vars - - name: Test incident start + - name: Create environment + run: | + INCIDENT_NUMBER=16 make -C sre create_environment + - name: Test fault injection + run: | + INCIDENT_NUMBER=16 make -C sre inject_incident_fault + - name: Test fault removal run: | - INCIDENT_NUMBER=16 make -C sre start_incident - - name: Test incident stop + INCIDENT_NUMBER=16 make -C sre remove_incident_fault + - name: Destroy environment run: | - INCIDENT_NUMBER=16 make -C sre stop_incident + INCIDENT_NUMBER=16 make -C sre destroy_environment sre-incident-20: name: SRE Incident 20 Smoke Test needs: @@ -628,12 +640,18 @@ jobs: - name: Create group vars run: | make -C sre group_vars - - name: Test incident start + - name: Create environment run: | - INCIDENT_NUMBER=20 make -C sre 
start_incident - - name: Test incident stop + INCIDENT_NUMBER=20 make -C sre create_environment + - name: Test fault injection run: | - INCIDENT_NUMBER=20 make -C sre stop_incident + INCIDENT_NUMBER=20 make -C sre inject_incident_fault + - name: Test fault removal + run: | + INCIDENT_NUMBER=20 make -C sre remove_incident_fault + - name: Destroy environment + run: | + INCIDENT_NUMBER=20 make -C sre destroy_environment sre-incident-23: name: SRE Incident 23 Smoke Test needs: @@ -663,12 +681,18 @@ jobs: - name: Create group vars run: | make -C sre group_vars - - name: Test incident start + - name: Create environment + run: | + INCIDENT_NUMBER=23 make -C sre create_environment + - name: Test fault injection run: | - INCIDENT_NUMBER=23 make -C sre start_incident - - name: Test incident stop + INCIDENT_NUMBER=23 make -C sre inject_incident_fault + - name: Test fault removal run: | - INCIDENT_NUMBER=23 make -C sre stop_incident + INCIDENT_NUMBER=23 make -C sre remove_incident_fault + - name: Destroy environment + run: | + INCIDENT_NUMBER=23 make -C sre destroy_environment sre-incident-26: name: SRE Incident 26 Smoke Test needs: @@ -699,12 +723,18 @@ jobs: - name: Create group vars run: | make -C sre group_vars - - name: Test incident start + - name: Create environment + run: | + INCIDENT_NUMBER=26 make -C sre create_environment + - name: Test fault injection + run: | + INCIDENT_NUMBER=26 make -C sre inject_incident_fault + - name: Test fault removal run: | - INCIDENT_NUMBER=26 make -C sre start_incident - - name: Test incident stop + INCIDENT_NUMBER=26 make -C sre remove_incident_fault + - name: Destroy environment run: | - INCIDENT_NUMBER=26 make -C sre stop_incident + INCIDENT_NUMBER=26 make -C sre destroy_environment sre-incident-30: name: SRE Incident 30 Smoke Test needs: @@ -734,12 +764,18 @@ jobs: - name: Create group vars run: | make -C sre group_vars - - name: Test incident start + - name: Create environment run: | - INCIDENT_NUMBER=30 make -C sre start_incident - - name: Test incident stop + INCIDENT_NUMBER=30 make -C sre create_environment + - name: Test fault injection run: | - INCIDENT_NUMBER=30 make -C sre stop_incident + INCIDENT_NUMBER=30 make -C sre inject_incident_fault + - name: Test fault removal + run: | + INCIDENT_NUMBER=30 make -C sre remove_incident_fault + - name: Destroy environment + run: | + INCIDENT_NUMBER=30 make -C sre destroy_environment sre-incident-31: name: SRE Incident 31 Smoke Test needs: @@ -769,12 +805,18 @@ jobs: - name: Create group vars run: | make -C sre group_vars - - name: Test incident start + - name: Create environment + run: | + INCIDENT_NUMBER=31 make -C sre create_environment + - name: Test fault injection run: | - INCIDENT_NUMBER=31 make -C sre start_incident - - name: Test incident stop + INCIDENT_NUMBER=31 make -C sre inject_incident_fault + - name: Test fault removal run: | - INCIDENT_NUMBER=31 make -C sre stop_incident + INCIDENT_NUMBER=31 make -C sre remove_incident_fault + - name: Destroy environment + run: | + INCIDENT_NUMBER=31 make -C sre destroy_environment sre-incident-33: name: SRE Incident 33 Smoke Test needs: @@ -804,12 +846,18 @@ jobs: - name: Create group vars run: | make -C sre group_vars - - name: Test incident start + - name: Create environment + run: | + INCIDENT_NUMBER=33 make -C sre create_environment + - name: Test fault injection run: | - INCIDENT_NUMBER=33 make -C sre start_incident - - name: Test incident stop + INCIDENT_NUMBER=33 make -C sre inject_incident_fault + - name: Test fault removal run: | - INCIDENT_NUMBER=33 
make -C sre stop_incident + INCIDENT_NUMBER=33 make -C sre remove_incident_fault + - name: Destroy environment + run: | + INCIDENT_NUMBER=33 make -C sre destroy_environment sre-incident-34: name: SRE Incident 34 Smoke Test needs: @@ -839,12 +887,18 @@ jobs: - name: Create group vars run: | make -C sre group_vars - - name: Test incident start + - name: Create environment + run: | + INCIDENT_NUMBER=34 make -C sre create_environment + - name: Test fault injection run: | - INCIDENT_NUMBER=34 make -C sre start_incident - - name: Test incident stop + INCIDENT_NUMBER=34 make -C sre inject_incident_fault + - name: Test fault removal run: | - INCIDENT_NUMBER=34 make -C sre stop_incident + INCIDENT_NUMBER=34 make -C sre remove_incident_fault + - name: Destroy environment + run: | + INCIDENT_NUMBER=34 make -C sre destroy_environment sre-incident-102: name: SRE Incident 102 Smoke Test needs: @@ -874,12 +928,18 @@ jobs: - name: Create group vars run: | make -C sre group_vars - - name: Test incident start + - name: Create environment + run: | + INCIDENT_NUMBER=102 make -C sre create_environment + - name: Test fault injection + run: | + INCIDENT_NUMBER=102 make -C sre inject_incident_fault + - name: Test fault removal run: | - INCIDENT_NUMBER=102 make -C sre start_incident - - name: Test incident stop + INCIDENT_NUMBER=102 make -C sre remove_incident_fault + - name: Destroy environment run: | - INCIDENT_NUMBER=102 make -C sre stop_incident + INCIDENT_NUMBER=102 make -C sre destroy_environment sre-incident-105: name: SRE Incident 105 Smoke Test needs: @@ -909,12 +969,18 @@ jobs: - name: Create group vars run: | make -C sre group_vars - - name: Test incident start + - name: Create environment run: | - INCIDENT_NUMBER=105 make -C sre start_incident - - name: Test incident stop + INCIDENT_NUMBER=105 make -C sre create_environment + - name: Test fault injection run: | - INCIDENT_NUMBER=105 make -C sre stop_incident + INCIDENT_NUMBER=105 make -C sre inject_incident_fault + - name: Test fault removal + run: | + INCIDENT_NUMBER=105 make -C sre remove_incident_fault + - name: Destroy environment + run: | + INCIDENT_NUMBER=105 make -C sre destroy_environment finops-incident-37: name: FinOps Incident 37 Smoke Test needs: @@ -946,12 +1012,18 @@ jobs: - name: Create group vars run: | make -C sre group_vars - - name: Test incident start + - name: Create environment + run: | + INCIDENT_NUMBER=37 make -C sre create_environment + - name: Test fault injection run: | - INCIDENT_NUMBER=37 make -C sre start_incident - - name: Test incident stop + INCIDENT_NUMBER=37 make -C sre inject_incident_fault + - name: Test fault removal run: | - INCIDENT_NUMBER=37 make -C sre stop_incident + INCIDENT_NUMBER=37 make -C sre remove_incident_fault + - name: Destroy environment + run: | + INCIDENT_NUMBER=37 make -C sre destroy_environment finops-incident-38: name: FinOps Incident 38 Smoke Test needs: @@ -983,9 +1055,15 @@ jobs: - name: Create group vars run: | make -C sre group_vars - - name: Test incident start + - name: Create environment + run: | + INCIDENT_NUMBER=38 make -C sre create_environment + - name: Test fault injection + run: | + INCIDENT_NUMBER=38 make -C sre inject_incident_fault + - name: Test fault removal run: | - INCIDENT_NUMBER=38 make -C sre start_incident - - name: Test incident stop + INCIDENT_NUMBER=38 make -C sre remove_incident_fault + - name: Destroy environment run: | - INCIDENT_NUMBER=38 make -C sre stop_incident + INCIDENT_NUMBER=38 make -C sre destroy_environment From 789151809a8e5a4eed4235930c8a76750cecbba9 
Mon Sep 17 00:00:00 2001 From: Gerard Vanloo Date: Thu, 14 Aug 2025 11:04:27 -0400 Subject: [PATCH 10/35] fix: correct jaeger collector otlp grpc endpoint Signed-off-by: Gerard Vanloo --- sre/roles/applications/tasks/install_otel_demo.yaml | 2 +- sre/roles/tools/tasks/set_jaeger_endpoint.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sre/roles/applications/tasks/install_otel_demo.yaml b/sre/roles/applications/tasks/install_otel_demo.yaml index 4c3faa3b..aa1ef112 100644 --- a/sre/roles/applications/tasks/install_otel_demo.yaml +++ b/sre/roles/applications/tasks/install_otel_demo.yaml @@ -294,7 +294,7 @@ http: endpoint: http://{{ helm_releases.opensearch.name }}-master.{{ helm_releases.opensearch.namespace }}.svc.cluster.local:9200 otlp: - endpoint: "{{ tools_jaeger_collector_otlp_endpoint }}" + endpoint: "{{ tools_jaeger_collector_otlp_grpc_endpoint }}" otlphttp/prometheus: endpoint: http://{{ helm_releases.prometheus.name }}-kube-prometheus-prometheus.{{ helm_releases.prometheus.namespace }}.svc.cluster.local:9090 prometheus: diff --git a/sre/roles/tools/tasks/set_jaeger_endpoint.yaml b/sre/roles/tools/tasks/set_jaeger_endpoint.yaml index b88b97db..07844a6c 100644 --- a/sre/roles/tools/tasks/set_jaeger_endpoint.yaml +++ b/sre/roles/tools/tasks/set_jaeger_endpoint.yaml @@ -10,8 +10,8 @@ - name: Construct Jaeger in-cluster endpoints ansible.builtin.set_fact: - tools_jaeger_collector_otlp_endpoint: http://{{ tools_jaeger_service_info.resources[0].metadata.name }}.{{ - tools_jaeger_service_info.resources[0].metadata.namespace }}.svc.cluster.local:4317 + tools_jaeger_collector_otlp_grpc_endpoint: "{{ tools_jaeger_service_info.resources[0].metadata.name }}.{{ + tools_jaeger_service_info.resources[0].metadata.namespace }}.svc.cluster.local:4317" tools_jaeger_querier_endpoint: http://{{ tools_jaeger_service_info.resources[0].metadata.name }}.{{ tools_jaeger_service_info.resources[0].metadata.namespace }}.svc.cluster.local:16686 when: From f66579640ca75d35a73cf5516537e850bec7f0cc Mon Sep 17 00:00:00 2001 From: Gerard Vanloo Date: Fri, 15 Aug 2025 09:48:45 -0400 Subject: [PATCH 11/35] fix: correct jaeger query endpoint Signed-off-by: Gerard Vanloo --- .github/workflows/sre-lint-playbooks.yaml | 2 +- sre/Makefile | 4 +- sre/playbooks/manage_recorders.yaml | 24 ++---------- .../files/kubernetes/topology/kubernetes.yaml | 2 +- .../files/kubernetes/traces/jaeger.yaml | 2 +- .../files/scripts/traces/jaeger/gather.py | 39 ++++++++++--------- .../uninstall_traces_recorders_jaeger.yaml | 2 +- .../tools/tasks/set_jaeger_endpoint.yaml | 2 +- 8 files changed, 30 insertions(+), 47 deletions(-) diff --git a/.github/workflows/sre-lint-playbooks.yaml b/.github/workflows/sre-lint-playbooks.yaml index b3604486..be615fb3 100644 --- a/.github/workflows/sre-lint-playbooks.yaml +++ b/.github/workflows/sre-lint-playbooks.yaml @@ -26,7 +26,7 @@ jobs: - name: Run ansible-lint uses: ansible/ansible-lint@v25.8.1 with: - args: playbooks/manage_applications.yaml playbooks/manage_tools.yaml playbooks/manage_incidents.yaml + args: playbooks/manage_applications.yaml playbooks/manage_tools.yaml playbooks/manage_incidents.yaml playbooks/manage_recorders.yaml setup_python: "false" working_directory: sre requirements_file: requirements.yaml diff --git a/sre/Makefile b/sre/Makefile index 668277ba..4491bd4e 100644 --- a/sre/Makefile +++ b/sre/Makefile @@ -105,10 +105,10 @@ create_environment: deploy_tools deploy_applications ## Deploys tools and applic destroy_environment: undeploy_applications 
undeploy_tools ## Undeploys tools and applications to cluster .PHONY: start_incident -start_incident: create_environment inject_incident_fault deploy_recorders ## Starts an incident by deploying a stack, application, and fault for an incident +start_incident: create_environment inject_incident_fault deploy_recorders ## Starts an incident by deploying a stack, applications, faults, and recorders for an incident .PHONY: stop_incident -stop_incident: undeploy_recorders remove_incident_fault destroy_environment ## Stops an incident by undeploying a stack, application, and fault for an incident +stop_incident: undeploy_recorders remove_incident_fault destroy_environment ## Stops an incident by undeploying a stack, applications, faults, and recorders for an incident .PHONY: deploy_awx_stack deploy_awx_stack: ## Deploys AWX to a cluster diff --git a/sre/playbooks/manage_recorders.yaml b/sre/playbooks/manage_recorders.yaml index fddb2094..0715f78c 100644 --- a/sre/playbooks/manage_recorders.yaml +++ b/sre/playbooks/manage_recorders.yaml @@ -34,24 +34,6 @@ id: "{{ incident_id }}" when: - incident_id is defined - - - name: Create tools_enabled dictionary - ansible.builtin.set_fact: - tools_enabled: - chaos_mesh: "{{ tools.chaos_mesh | default(false) }}" - clickhouse: "{{ tools.clickhouse | default(false) }}" - ingress: "{{ tools.ingress | default(false) }}" - jaeger: "{{ tools.jaeger | default(false) }}" - kubernetes_metrics_server: "{{ tools.kubernetes_metrics_server | default(false) }}" - kubernetes_topology_monitor: "{{ tools.kubernetes_topology_monitor | default(false) }}" - opencost: "{{ tools.opencost | default(false) }}" - opensearch: "{{ tools.opensearch | default(false) }}" - opentelemetry: "{{ tools.opentelemetry | default(false) }}" - prometheus: "{{ tools.prometheus | default(false) }}" - tags: - - always - when: - - incident_id is undefined tasks: - name: Import recorders role ansible.builtin.import_role: @@ -62,8 +44,8 @@ platform: "{{ cluster_platform }}" recorders_enabled: alerts: - prometheus: "{{ tools_enabled.prometheus }}" + prometheus: "{{ tools_enabled.prometheus | default(tools.prometheus) }}" topology: - kubernetes: "{{ tools_enabled.kubernetes_topology_monitor }}" + kubernetes: "{{ tools_enabled.kubernetes_topology_monitor | default(tools.kubernetes_topology_monitor) }}" traces: - jaeger: "{{ tools_enabled.jaeger }}" + jaeger: "{{ tools_enabled.jaeger | default(tools.jaeger) }}" diff --git a/sre/roles/recorders/files/kubernetes/topology/kubernetes.yaml b/sre/roles/recorders/files/kubernetes/topology/kubernetes.yaml index ee11a116..7393231c 100644 --- a/sre/roles/recorders/files/kubernetes/topology/kubernetes.yaml +++ b/sre/roles/recorders/files/kubernetes/topology/kubernetes.yaml @@ -66,6 +66,6 @@ spec: - ReadWriteOnce resources: requests: - storage: 10Gi + storage: 5Gi persistentVolumeClaimRetentionPolicy: whenDeleted: Delete diff --git a/sre/roles/recorders/files/kubernetes/traces/jaeger.yaml b/sre/roles/recorders/files/kubernetes/traces/jaeger.yaml index f92df936..53167db3 100644 --- a/sre/roles/recorders/files/kubernetes/traces/jaeger.yaml +++ b/sre/roles/recorders/files/kubernetes/traces/jaeger.yaml @@ -66,6 +66,6 @@ spec: - ReadWriteOnce resources: requests: - storage: 1Gi + storage: 5Gi persistentVolumeClaimRetentionPolicy: whenDeleted: Delete diff --git a/sre/roles/recorders/files/scripts/traces/jaeger/gather.py b/sre/roles/recorders/files/scripts/traces/jaeger/gather.py index 6fd49dfb..1974bb25 100644 --- a/sre/roles/recorders/files/scripts/traces/jaeger/gather.py 
+++ b/sre/roles/recorders/files/scripts/traces/jaeger/gather.py @@ -19,7 +19,7 @@ def get_services(session, endpoint, headers): response = session.get( - "{0}/api/services".format(endpoint), + "{0}/api/v3/services".format(endpoint), headers=headers, verify=True ) @@ -29,11 +29,11 @@ def get_services(session, endpoint, headers): return [] content = response.json() - return content.get("data", []) + return content.get("services", []) def get_operations(session, endpoint, headers, service): response = session.get( - "{0}/api/operations".format(endpoint), + "{0}/api/v3/operations".format(endpoint), headers=headers, params={ "service": service, @@ -46,7 +46,7 @@ def get_operations(session, endpoint, headers, service): return [] content = response.json() - return content.get("data", []) + return content.get("operations", []) def get_traces(session, endpoint, headers, service, operation, time_window): name = operation.get("name") @@ -55,14 +55,14 @@ def get_traces(session, endpoint, headers, service, operation, time_window): return [] response = session.get( - "{0}/api/traces".format(endpoint), + "{0}/api/v3/traces".format(endpoint), headers=headers, params={ - "service": service, - "operation": name, - "start": time_window[0], - "end": time_window[1], - "limit": 1 + "query.service_name": service, + "query.operation_name": name, + "query.start_time_min": time_window[0], + "query.start_time_max": time_window[1], + "query.num_traces": "1" }, verify=True ) @@ -72,7 +72,7 @@ def get_traces(session, endpoint, headers, service, operation, time_window): return [] content = response.json() - return content.get("data", []) + return content.get("result", {}).get("resourceSpans", []) def main(): endpoint = os.environ.get("JAEGER_ENDPOINT") @@ -88,16 +88,17 @@ def main(): session.mount("http://", adapter) session.mount("https://", adapter) - logger.info("sleeping for 5 minutes to wait for Jaeger to receive data") - time.sleep(300) - while True: - next_datetime = datetime.now() + timedelta(seconds=300) + current_datetime = datetime.now() + last_datetime = current_datetime - timedelta(seconds=300) + next_datetime = current_datetime + timedelta(seconds=300) traces = [] - end_time = int(time.time_ns() // 1000) - start_time = end_time - (300 * 1_000_000) + time_window = ( + "{0}000Z".format(last_datetime.isoformat(timespec='microseconds')), + "{0}000Z".format(current_datetime.isoformat(timespec='microseconds')) + ) services = get_services(session, endpoint, headers) logger.info("retrieved {0} services from jaeger".format(len(services))) @@ -107,8 +108,8 @@ def main(): logger.info("retrieved {0} operations from jaeger".format(len(operations))) for operation in operations: - t = get_traces(session, endpoint, headers, service, operation, (start_time, end_time)) - logger.info("retrieved {0} traces from jaeger".format(len(ts))) + t = get_traces(session, endpoint, headers, service, operation, time_window) + logger.info("retrieved {0} traces from jaeger".format(len(t))) traces.extend(t) diff --git a/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml b/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml index 578105db..a559c5e3 100644 --- a/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml +++ b/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml @@ -1,5 +1,5 @@ --- -- name: Retrieve the topology recorder pod name +- name: Retrieve the traces recorder pod name kubernetes.core.k8s_info: api_version: v1 kind: Pod diff --git 
a/sre/roles/tools/tasks/set_jaeger_endpoint.yaml b/sre/roles/tools/tasks/set_jaeger_endpoint.yaml index 07844a6c..1e577f6b 100644 --- a/sre/roles/tools/tasks/set_jaeger_endpoint.yaml +++ b/sre/roles/tools/tasks/set_jaeger_endpoint.yaml @@ -13,7 +13,7 @@ tools_jaeger_collector_otlp_grpc_endpoint: "{{ tools_jaeger_service_info.resources[0].metadata.name }}.{{ tools_jaeger_service_info.resources[0].metadata.namespace }}.svc.cluster.local:4317" tools_jaeger_querier_endpoint: http://{{ tools_jaeger_service_info.resources[0].metadata.name }}.{{ - tools_jaeger_service_info.resources[0].metadata.namespace }}.svc.cluster.local:16686 + tools_jaeger_service_info.resources[0].metadata.namespace }}.svc.cluster.local:16686/jaeger when: - tools_jaeger_service_info is defined - tools_jaeger_service_info.resources | length == 1 From afb29e424e7baa65abfb1c2fee9619899c6725b4 Mon Sep 17 00:00:00 2001 From: Gerard Vanloo Date: Fri, 15 Aug 2025 09:54:23 -0400 Subject: [PATCH 12/35] bump: update ansible-lint to latest Signed-off-by: Gerard Vanloo --- requirements-dev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 047357c0..1fcf8582 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,4 +1,4 @@ -ansible-lint==25.7.0 +ansible-lint==25.8.1 commitizen==4.8.3 detect-secrets==1.5.0 pre_commit==4.3.0 From 1aa7e0c39a5679f8a8f9087a61c9f8daa9b86b8e Mon Sep 17 00:00:00 2001 From: Gerard Vanloo Date: Fri, 15 Aug 2025 09:55:54 -0400 Subject: [PATCH 13/35] chore: remove jaeger cronjob Signed-off-by: Gerard Vanloo --- sre/cron_jobs/gather_traces.py | 189 -------------------- sre/cron_jobs/leverage_ingress.yaml | 23 --- sre/cron_jobs/leverage_port_forwarding.yaml | 68 ------- sre/cron_jobs/trace_recorder.yaml | 39 ---- 4 files changed, 319 deletions(-) delete mode 100644 sre/cron_jobs/gather_traces.py delete mode 100644 sre/cron_jobs/leverage_ingress.yaml delete mode 100644 sre/cron_jobs/leverage_port_forwarding.yaml delete mode 100644 sre/cron_jobs/trace_recorder.yaml diff --git a/sre/cron_jobs/gather_traces.py b/sre/cron_jobs/gather_traces.py deleted file mode 100644 index 4bdc4eb3..00000000 --- a/sre/cron_jobs/gather_traces.py +++ /dev/null @@ -1,189 +0,0 @@ -#!/usr/bin/env python3 -# Source: https://github.ibm.com/Saurabh-Jha/NTAM/blob/main/gather_traces.py -import argparse -import json -import logging -import os -import time -from typing import Any, Dict, Optional - -import requests -from requests.adapters import HTTPAdapter -from urllib3.util.retry import Retry - -# Configure logging -logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") -logger = logging.getLogger(__name__) - -REQUEST_TIMEOUT = 120 - -def create_session() -> requests.Session: - """ - Create a requests.Session with a retry strategy. - """ - session = requests.Session() - retries = Retry( - total=3, # number of total retries - backoff_factor=0.3, # factor for time between retries - status_forcelist=[500, 502, 503, 504] - ) - adapter = HTTPAdapter(max_retries=retries) - session.mount("http://", adapter) - session.mount("https://", adapter) - return session - - -def get_services( - session: requests.Session, - jaeger_url: str, - token: str, -) -> Dict[str, Any]: - """ - Fetch the list of services from Jaeger. 
- """ - url = f"{jaeger_url}/api/services" - headers = { - "Authorization": f"Bearer {token}", - "Content-Type": "application/json", - } - - try: - resp = session.get(url, headers=headers, timeout=REQUEST_TIMEOUT) - resp.raise_for_status() - return resp.json() # e.g., { "total": int, "data": [service1, service2, ...] } - except Exception as e: - logger.error(f"Error fetching services: {e}") - return {} - -def get_operations( - session: requests.Session, - jaeger_url: str, - token: str, - service: str -) -> Dict[str, Any]: - """ - Fetch the list of operations for a given service from Jaeger. - """ - url = f"{jaeger_url}/api/operations" - headers = { - "Authorization": f"Bearer {token}", - "Content-Type": "application/json" - } - params = {"service": service} - - try: - resp = session.get(url, headers=headers, params=params, timeout=REQUEST_TIMEOUT) - resp.raise_for_status() - return resp.json() # e.g., { "total": int, "data": [ { "name": ...}, ...] } - except Exception as e: - logger.error(f"Error fetching operations for service '{service}': {e}") - return {} - -def get_traces( - session: requests.Session, - jaeger_url: str, - token: str, - service: str, - operation: Optional[str], - start_time: int, - end_time: int, - limit: int = 1 -) -> Dict[str, Any]: - """ - Query Jaeger traces for a given service & operation over a time window. - """ - url = f"{jaeger_url}/api/traces" - headers = { - "Authorization": f"Bearer {token}", - "Content-Type": "application/json" - } - params = { - "service": service, - "operation": operation, - "start": start_time, - "end": end_time, - "limit": limit - } - - try: - resp = session.get(url, headers=headers, params=params, timeout=REQUEST_TIMEOUT) - resp.raise_for_status() - return resp.json() # typically { "data": [ ... ] } - except Exception as e: - logger.error(f"Error fetching traces for service '{service}', operation '{operation}': {e}") - return {} - -def main(): - parser = argparse.ArgumentParser(description="Collect one trace per service/operation from Jaeger.") - parser.add_argument( - "--jaeger_url", - required=True, - help="Jaeger base URL (e.g., https://jaeger-domain)" - ) - parser.add_argument( - "--jaeger_token", - required=True, - help="Jaeger service account token." - ) - parser.add_argument( - "--time_window", - type=int, - default=300, - help="Time window in seconds for the last traces. Default is 300 (5 minutes)." - ) - parser.add_argument( - "--output_file", - type=str, - default="traces.json", - help="File path to store collected traces as JSON. Default is 'traces.json'." 
- ) - args = parser.parse_args() - - # Prepare the time range in microseconds - end_time = int(time.time_ns() // 1000) - start_time = end_time - (args.time_window * 1_000_000) # from now - time_window in microseconds - - session = create_session() - - # Retrieve services - services_data = get_services(session, args.jaeger_url, args.jaeger_token) - if not services_data or "data" not in services_data: - logger.info("No services found or unable to fetch services.") - return - - all_traces = {"data": []} - - # Iterate over services & operations, collect up to one trace each - for service in services_data["data"]: - operations_data = get_operations(session, args.jaeger_url, args.jaeger_token, service) - if not operations_data or "data" not in operations_data: - logger.info(f"No operations found or unable to fetch operations for service: {service}") - continue - - for op in operations_data["data"]: - operation_name = op.get("name", "") - logger.info(f"Fetching trace for service '{service}' and operation '{operation_name}'") - trace_json = get_traces( - session, - args.jaeger_url, - args.jaeger_token, - service, - operation_name, - start_time, - end_time, - limit=1 - ) - # Merge new data - if "data" in trace_json: - all_traces["data"].extend(trace_json["data"]) - - # Write to output if we have traces - if all_traces["data"]: - with open(args.output_file, "w") as f: - json.dump(all_traces, f, indent=2) - logger.info(f"Traces successfully written to {args.output_file}") - else: - logger.info("No traces retrieved within the specified time window.") - -if __name__ == "__main__": - main() diff --git a/sre/cron_jobs/leverage_ingress.yaml b/sre/cron_jobs/leverage_ingress.yaml deleted file mode 100644 index f525c350..00000000 --- a/sre/cron_jobs/leverage_ingress.yaml +++ /dev/null @@ -1,23 +0,0 @@ ---- -- name: Get the Ingress URL - ansible.builtin.shell: "KUBECONFIG={{ kubeconfig }} kubectl get ingress prometheus -n {{ prometheus_namespace_project_name }} -o json" - register: observability_stack_ingress - retries: 5 - delay: 60 - until: (observability_stack_ingress.stdout | length) > 0 - ignore_errors: yes - -- name: Extract the Ingress hostname information - set_fact: - ingress_hostname: "{{ observability_stack_ingress.stdout | from_json | json_query('status.loadBalancer.ingress[0].hostname') }}" - when: observability_stack_ingress.stdout | trim != '' - -- name: Set the Prometheus URL - set_fact: - prometheus_url: "http://{{ ingress_hostname }}/prometheus" - when: ingress_hostname is defined and ingress_hostname | trim != '' - -- name: Set the Jaeger URL - set_fact: - jaeger_url: "http://{{ ingress_hostname }}/jaeger" - when: ingress_hostname is defined and ingress_hostname | trim != '' diff --git a/sre/cron_jobs/leverage_port_forwarding.yaml b/sre/cron_jobs/leverage_port_forwarding.yaml deleted file mode 100644 index 472d829b..00000000 --- a/sre/cron_jobs/leverage_port_forwarding.yaml +++ /dev/null @@ -1,68 +0,0 @@ ---- -- name: Check availability of ports - ansible.builtin.shell: | - lsof -i :{{ item }} > /dev/null && echo "in_use" || echo "available" - register: lsof_check - loop: "{{ range(32100, 32125) | list }}" - changed_when: false - failed_when: false - loop_control: - loop_var: item - when: ingress_hostname is undefined or ingress_hostname | trim == '' - -- name: Assign a dynamic port if one is available - set_fact: - dynamic_port_for_prometheus: "{{ (lsof_check.results | selectattr('stdout', 'equalto', 'available') | map(attribute='item') | list | first) }}" - when: ingress_hostname is 
undefined or ingress_hostname | trim == '' - -- name: Kubectl port-forward on/for the prometheus-server service with dynamic port - ansible.builtin.shell: KUBECONFIG={{ kubeconfig }} kubectl -n "{{ prometheus_namespace_project_name }}" port-forward "svc/prometheus-server" "{{ dynamic_port_for_prometheus }}:80" --request-timeout=10m - async: 600 - poll: 0 - register: prometheus_port_forward_for_datasources_creation - when: ingress_hostname is undefined or ingress_hostname | trim == '' - -- name: Wait for port-forward to be available - wait_for_connection: - delay: 5 - timeout: 30 - when: ingress_hostname is undefined or ingress_hostname | trim == '' - -- name: Set the Prometheus URL - set_fact: - prometheus_url: "http://127.0.0.1:{{ dynamic_port_for_prometheus }}" - when: ingress_hostname is undefined or ingress_hostname | trim == '' - -- name: Kubectl port-forward on/for Jaeger collector with dynamic port - ansible.builtin.shell: KUBECONFIG={{ kubeconfig }} kubectl -n "{{ opentelemetry_operator_collectors_namespace }}" port-forward "svc/jaeger-collector" "{{ dynamic_port_for_jaeger }}:16686" --request-timeout=10m - async: 600 - poll: 0 - register: jaeger_port_forward - when: ingress_hostname is undefined or ingress_hostname | trim == '' - -- name: Check availability of ports - ansible.builtin.shell: | - lsof -i :{{ item }} > /dev/null && echo "in_use" || echo "available" - register: lsof_check - loop: "{{ range(32100, 32125) | list }}" - changed_when: false - failed_when: false - loop_control: - loop_var: item - when: ingress_hostname is undefined or ingress_hostname | trim == '' - -- name: Assign a dynamic port if one is available - set_fact: - dynamic_port_for_jaeger: "{{ (lsof_check.results | selectattr('stdout', 'equalto', 'available') | map(attribute='item') | list | first) }}" - when: ingress_hostname is undefined or ingress_hostname | trim == '' - -- name: Wait for port-forward to be available - wait_for_connection: - delay: 5 - timeout: 30 - when: ingress_hostname is undefined or ingress_hostname | trim == '' - -- name: Set the Jaeger URL - set_fact: - jaeger_url: "http://127.0.0.1:{{ dynamic_port_for_jaeger }}" - when: ingress_hostname is undefined or ingress_hostname | trim == '' diff --git a/sre/cron_jobs/trace_recorder.yaml b/sre/cron_jobs/trace_recorder.yaml deleted file mode 100644 index 71726324..00000000 --- a/sre/cron_jobs/trace_recorder.yaml +++ /dev/null @@ -1,39 +0,0 @@ ---- -- name: Trace Extractor - hosts: localhost - vars: - prometheus_namespace_project_name: prometheus - opentelemetry_operator_collectors_namespace: opentelemetry-collectors - tasks: - - name: Sleep for 300 seconds and continue with play - ansible.builtin.pause: - seconds: 30 - - - name: (Hack) Kubeconfig path inside AWX-EE container - ansible.builtin.shell: find /runner/env -type f -size +3072c - register: kubeconfig - - - name: Tasks associated with leveraging ingress - ansible.builtin.include_tasks: - file: leverage_ingress.yaml - - - name: Tasks associated with leveraging port forwarding - ansible.builtin.include_tasks: - file: leverage_port_forwarding.yaml - - - name: Run traces.py from https://github.ibm.com/Saurabh-Jha/NTAM/blob/main/gather_traces.py - ansible.builtin.shell: "python /runner/project/sre/cron_jobs/gather_traces.py --jaeger_url {{ jaeger_url }} --jaeger_token 'NOT_NEEDED' --output_file /runner/traces.json" - - - name: Check if traces.json exists - ansible.builtin.stat: - path: "/runner/traces.json" - register: traces_json_stat - - - name: Upload traces.json to S3 - 
amazon.aws.s3_object: - endpoint_url: "{{ s3_endpoint_url }}" - bucket: "{{ s3_bucket_name_for_results }}" - object: "/{{ sre_agent_name__version_number }}/{{run_uuid}}/{{scenario_number}}/{{run_number}}/traces.json" - src: "/runner/traces.json" - mode: put - when: traces_json_stat.stat.exists From 2a5bb29c564693f8136701e4cece8709410e7e15 Mon Sep 17 00:00:00 2001 From: Gerard Vanloo Date: Fri, 15 Aug 2025 10:11:49 -0400 Subject: [PATCH 14/35] chore: remove unneeded common role Signed-off-by: Gerard Vanloo --- sre/roles/common/meta/argument_specs.yaml | 57 ------------------- .../common/tasks/set_ingress_hostname.yaml | 31 ---------- 2 files changed, 88 deletions(-) delete mode 100644 sre/roles/common/meta/argument_specs.yaml delete mode 100644 sre/roles/common/tasks/set_ingress_hostname.yaml diff --git a/sre/roles/common/meta/argument_specs.yaml b/sre/roles/common/meta/argument_specs.yaml deleted file mode 100644 index bf211be9..00000000 --- a/sre/roles/common/meta/argument_specs.yaml +++ /dev/null @@ -1,57 +0,0 @@ ---- -argument_specs: - check_kubectl_version: - author: - - Gerard Vanloo - options: - common_cluster: - required: true - type: dict - options: - kubeconfig: - required: true - type: str - set_cluster_platform: - author: - - Gerard Vanloo - options: - common_cluster: - required: true - type: dict - options: - kubeconfig: - required: true - type: str - set_cluster_provider: - author: - - Gerard Vanloo - options: - common_cluster: - required: true - type: dict - options: - kubeconfig: - required: true - type: str - set_ingress_hostname: - short_description: Task for getting Ingress hostnames - author: - - Gerard Vanloo - options: - common_cluster: - required: true - type: dict - options: - kubeconfig: - required: true - type: str - common_ingress: - required: true - type: dict - options: - name: - required: true - type: str - namespace: - required: true - type: str diff --git a/sre/roles/common/tasks/set_ingress_hostname.yaml b/sre/roles/common/tasks/set_ingress_hostname.yaml deleted file mode 100644 index a1767333..00000000 --- a/sre/roles/common/tasks/set_ingress_hostname.yaml +++ /dev/null @@ -1,31 +0,0 @@ ---- -- name: Get the Ingress object - kubernetes.core.k8s_info: - api_version: networking.k8s.io/v1 - kind: Ingress - name: "{{ common_ingress.name }}" - namespace: "{{ common_ingress.namespace }}" - kubeconfig: "{{ common_cluster.kubeconfig }}" - wait: true - register: ingress_info - until: - - ingress_info.resources[0].status.loadBalancer.ingress is defined - delay: 15 - retries: 12 - -- name: Extract the Ingress hostname information - ansible.builtin.set_fact: - ingress_hostname: "{{ ingress_info.resources[0].status.loadBalancer.ingress[0].hostname }}" - when: - - ingress_info.resources[0].status.loadBalancer.ingress is defined - - ingress_info.resources[0].status.loadBalancer.ingress | length > 0 - - ingress_info.resources[0].status.loadBalancer.ingress[0].hostname is defined - -- name: Extract the Ingress IP information - ansible.builtin.set_fact: - ingress_hostname: "{{ ingress_info.resources[0].status.loadBalancer.ingress[0].ip }}" - when: - - ingress_info.resources[0].status.loadBalancer.ingress is defined - - ingress_info.resources[0].status.loadBalancer.ingress | length > 0 - - ingress_info.resources[0].status.loadBalancer.ingress[0].hostname is undefined - - ingress_info.resources[0].status.loadBalancer.ingress[0].ip is defined From c0d3c31be9e82642cde36b4b5001f049826725b2 Mon Sep 17 00:00:00 2001 From: Gerard Vanloo Date: Fri, 15 Aug 2025 10:37:55 -0400 Subject: 
[PATCH 15/35] fix: update smoke test vars Signed-off-by: Gerard Vanloo --- .github/workflows/sre-integration-smoke-tests.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/sre-integration-smoke-tests.yaml b/.github/workflows/sre-integration-smoke-tests.yaml index 9676bb28..ebb24444 100644 --- a/.github/workflows/sre-integration-smoke-tests.yaml +++ b/.github/workflows/sre-integration-smoke-tests.yaml @@ -117,7 +117,7 @@ jobs: - name: Create group vars run: | make -C sre group_vars - echo "tools: { jaeger: true }" > sre/group_vars/environment/tools.yaml + echo "tools: { jaeger: true, kubernetes_topology_monitor: false, prometheus: false }" > sre/group_vars/environment/tools.yaml - name: Install tools run: | make -C sre deploy_tools @@ -154,7 +154,7 @@ jobs: - name: Create group vars run: | make -C sre group_vars - echo "tools: { kubernetes_topology_monitor: true }" > sre/group_vars/environment/tools.yaml + echo "tools: { jaeger: false, kubernetes_topology_monitor: true, prometheus: false }" > sre/group_vars/environment/tools.yaml - name: Install tools run: | make -C sre deploy_tools @@ -191,7 +191,7 @@ jobs: - name: Create group vars run: | make -C sre group_vars - echo "tools: { prometheus: true }" > sre/group_vars/environment/tools.yaml + echo "tools: { jaeger: false, kubernetes_topology_monitor: false, prometheus: true }" > sre/group_vars/environment/tools.yaml - name: Install tools run: | make -C sre deploy_tools From 5b552beb88a7c636859ddbaf2e8e1757e2525533 Mon Sep 17 00:00:00 2001 From: Gerard Vanloo Date: Mon, 18 Aug 2025 18:05:08 -0400 Subject: [PATCH 16/35] chore: fix files after rebase Signed-off-by: Gerard Vanloo --- .../sre-integration-smoke-tests.yaml | 6 +++--- sre/playbooks/manage_incidents.yaml | 21 ++++--------------- sre/playbooks/manage_recorders.yaml | 9 ++++---- sre/roles/recorders/meta/argument_specs.yaml | 2 +- sre/roles/recorders/tasks/install.yaml | 6 +++--- .../tasks/install_alerts_recorders.yaml | 2 +- .../tasks/install_topology_recorders.yaml | 2 +- .../tasks/install_traces_recorders.yaml | 2 +- sre/roles/recorders/tasks/uninstall.yaml | 6 +++--- .../tasks/uninstall_alerts_recorders.yaml | 2 +- .../tasks/uninstall_topology_recorders.yaml | 2 +- .../tasks/uninstall_traces_recorders.yaml | 2 +- 12 files changed, 24 insertions(+), 38 deletions(-) diff --git a/.github/workflows/sre-integration-smoke-tests.yaml b/.github/workflows/sre-integration-smoke-tests.yaml index ebb24444..09fd66e8 100644 --- a/.github/workflows/sre-integration-smoke-tests.yaml +++ b/.github/workflows/sre-integration-smoke-tests.yaml @@ -117,7 +117,7 @@ jobs: - name: Create group vars run: | make -C sre group_vars - echo "tools: { jaeger: true, kubernetes_topology_monitor: false, prometheus: false }" > sre/group_vars/environment/tools.yaml + mv sre/tests/files/jaeger.yaml sre/group_vars/environment/tools.yaml - name: Install tools run: | make -C sre deploy_tools @@ -154,7 +154,7 @@ jobs: - name: Create group vars run: | make -C sre group_vars - echo "tools: { jaeger: false, kubernetes_topology_monitor: true, prometheus: false }" > sre/group_vars/environment/tools.yaml + mv sre/tests/files/kubernetes-topology-monitor.yaml sre/group_vars/environment/tools.yaml - name: Install tools run: | make -C sre deploy_tools @@ -191,7 +191,7 @@ jobs: - name: Create group vars run: | make -C sre group_vars - echo "tools: { jaeger: false, kubernetes_topology_monitor: false, prometheus: true }" > sre/group_vars/environment/tools.yaml + mv 
sre/tests/files/prometheus.yaml sre/group_vars/environment/tools.yaml - name: Install tools run: | make -C sre deploy_tools diff --git a/sre/playbooks/manage_incidents.yaml b/sre/playbooks/manage_incidents.yaml index 44b97ca4..65029392 100644 --- a/sre/playbooks/manage_incidents.yaml +++ b/sre/playbooks/manage_incidents.yaml @@ -40,26 +40,13 @@ incidents_file: id: "{{ incident_id }}" tasks: - - name: Import recorders role + - name: Import faults role ansible.builtin.import_role: - name: recorders + name: faults vars: - recorders_cluster: + faults_cluster: kubeconfig: "{{ cluster.kubeconfig }}" - platform: "{{ cluster_platform }}" - recorders_enabled: - alerts: - prometheus: "{{ tools_enabled.prometheus }}" - topology: - kubernetes: "{{ tools_enabled.kubernetes_topology_monitor }}" - - # - name: Import faults role - # ansible.builtin.import_role: - # name: faults - # vars: - # faults_cluster: - # kubeconfig: "{{ cluster.kubeconfig }}" - # faults_specs: "{{ incidents_spec.spec.faults }}" + faults_specs: "{{ incidents_spec.spec.faults }}" # - name: Import e2e role # ansible.builtin.import_role: diff --git a/sre/playbooks/manage_recorders.yaml b/sre/playbooks/manage_recorders.yaml index 0715f78c..b9f880a0 100644 --- a/sre/playbooks/manage_recorders.yaml +++ b/sre/playbooks/manage_recorders.yaml @@ -26,7 +26,6 @@ - name: Import variables for incident ansible.builtin.import_role: name: incidents - tasks_from: load tags: - always vars: @@ -42,10 +41,10 @@ recorders_cluster: kubeconfig: "{{ cluster.kubeconfig }}" platform: "{{ cluster_platform }}" - recorders_enabled: + recorders_required: alerts: - prometheus: "{{ tools_enabled.prometheus | default(tools.prometheus) }}" + prometheus: "{{ incidents_tools.prometheus | default(tools.prometheus) }}" topology: - kubernetes: "{{ tools_enabled.kubernetes_topology_monitor | default(tools.kubernetes_topology_monitor) }}" + kubernetes: "{{ incidents_tools.kubernetes_topology_monitor | default(tools.kubernetes_topology_monitor) }}" traces: - jaeger: "{{ tools_enabled.jaeger | default(tools.jaeger) }}" + jaeger: "{{ incidents_tools.jaeger | default(tools.jaeger) }}" diff --git a/sre/roles/recorders/meta/argument_specs.yaml b/sre/roles/recorders/meta/argument_specs.yaml index 0f42c3d3..49a7bf28 100644 --- a/sre/roles/recorders/meta/argument_specs.yaml +++ b/sre/roles/recorders/meta/argument_specs.yaml @@ -23,7 +23,7 @@ argument_specs: default: kubernetes required: false type: str - recorders_enabled: + recorders_required: required: false type: dict options: diff --git a/sre/roles/recorders/tasks/install.yaml b/sre/roles/recorders/tasks/install.yaml index 9362ad68..1681d453 100644 --- a/sre/roles/recorders/tasks/install.yaml +++ b/sre/roles/recorders/tasks/install.yaml @@ -13,16 +13,16 @@ ansible.builtin.import_tasks: file: install_alerts_recorders.yaml when: - - recorders_enabled.alerts is defined + - recorders_required.alerts is defined - name: Import topology recorder installation tasks ansible.builtin.import_tasks: file: install_topology_recorders.yaml when: - - recorders_enabled.topology is defined + - recorders_required.topology is defined - name: Import traces recorder installation tasks ansible.builtin.import_tasks: file: install_traces_recorders.yaml when: - - recorders_enabled.traces is defined + - recorders_required.traces is defined diff --git a/sre/roles/recorders/tasks/install_alerts_recorders.yaml b/sre/roles/recorders/tasks/install_alerts_recorders.yaml index e409249e..a2403642 100644 --- 
a/sre/roles/recorders/tasks/install_alerts_recorders.yaml +++ b/sre/roles/recorders/tasks/install_alerts_recorders.yaml @@ -3,4 +3,4 @@ ansible.builtin.import_tasks: file: install_alerts_recorders_prometheus.yaml when: - - recorders_enabled.alerts.prometheus | default(true) + - recorders_required.alerts.prometheus | default(true) diff --git a/sre/roles/recorders/tasks/install_topology_recorders.yaml b/sre/roles/recorders/tasks/install_topology_recorders.yaml index d7496a34..c2686fd8 100644 --- a/sre/roles/recorders/tasks/install_topology_recorders.yaml +++ b/sre/roles/recorders/tasks/install_topology_recorders.yaml @@ -3,4 +3,4 @@ ansible.builtin.import_tasks: file: install_topology_recorders_kubernetes.yaml when: - - recorders_enabled.topology.kubernetes | default(true) + - recorders_required.topology.kubernetes | default(true) diff --git a/sre/roles/recorders/tasks/install_traces_recorders.yaml b/sre/roles/recorders/tasks/install_traces_recorders.yaml index 74ea1384..6daabd9c 100644 --- a/sre/roles/recorders/tasks/install_traces_recorders.yaml +++ b/sre/roles/recorders/tasks/install_traces_recorders.yaml @@ -3,4 +3,4 @@ ansible.builtin.import_tasks: file: install_traces_recorders_jaeger.yaml when: - - recorders_enabled.traces.jaeger | default(true) + - recorders_required.traces.jaeger | default(true) diff --git a/sre/roles/recorders/tasks/uninstall.yaml b/sre/roles/recorders/tasks/uninstall.yaml index b4d1b814..f5799b25 100644 --- a/sre/roles/recorders/tasks/uninstall.yaml +++ b/sre/roles/recorders/tasks/uninstall.yaml @@ -3,19 +3,19 @@ ansible.builtin.import_tasks: file: uninstall_alerts_recorders.yaml when: - - recorders_enabled.alerts is defined + - recorders_required.alerts is defined - name: Import topology recorder uninstallation tasks ansible.builtin.import_tasks: file: uninstall_topology_recorders.yaml when: - - recorders_enabled.topology is defined + - recorders_required.topology is defined - name: Import traces recorder uninstallation tasks ansible.builtin.import_tasks: file: uninstall_traces_recorders.yaml when: - - recorders_enabled.traces is defined + - recorders_required.traces is defined - name: Delete the namespace kubernetes.core.k8s: diff --git a/sre/roles/recorders/tasks/uninstall_alerts_recorders.yaml b/sre/roles/recorders/tasks/uninstall_alerts_recorders.yaml index 438d0610..b3328721 100644 --- a/sre/roles/recorders/tasks/uninstall_alerts_recorders.yaml +++ b/sre/roles/recorders/tasks/uninstall_alerts_recorders.yaml @@ -3,4 +3,4 @@ ansible.builtin.import_tasks: file: uninstall_alerts_recorders_prometheus.yaml when: - - recorders_enabled.alerts.prometheus | default(true) + - recorders_required.alerts.prometheus | default(true) diff --git a/sre/roles/recorders/tasks/uninstall_topology_recorders.yaml b/sre/roles/recorders/tasks/uninstall_topology_recorders.yaml index 6a79b336..c1ab54f8 100644 --- a/sre/roles/recorders/tasks/uninstall_topology_recorders.yaml +++ b/sre/roles/recorders/tasks/uninstall_topology_recorders.yaml @@ -3,4 +3,4 @@ ansible.builtin.import_tasks: file: uninstall_topology_recorders_kubernetes.yaml when: - - recorders_enabled.topology.kubernetes | default(true) + - recorders_required.topology.kubernetes | default(true) diff --git a/sre/roles/recorders/tasks/uninstall_traces_recorders.yaml b/sre/roles/recorders/tasks/uninstall_traces_recorders.yaml index 549855b5..2e5fd291 100644 --- a/sre/roles/recorders/tasks/uninstall_traces_recorders.yaml +++ b/sre/roles/recorders/tasks/uninstall_traces_recorders.yaml @@ -3,4 +3,4 @@ ansible.builtin.import_tasks: 
file: uninstall_traces_recorders_jaeger.yaml
   when:
-    - recorders_enabled.traces.jaeger | default(true)
+    - recorders_required.traces.jaeger | default(true)

From 8876315a635d02fb7eecf5826d5deb613f22416b Mon Sep 17 00:00:00 2001
From: Gerard Vanloo
Date: Tue, 19 Aug 2025 10:35:27 -0400
Subject: [PATCH 17/35] chore: remove non-firing alerts

Signed-off-by: Gerard Vanloo
---
 .../recorders/files/scripts/alerts/prometheus/gather.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/sre/roles/recorders/files/scripts/alerts/prometheus/gather.py b/sre/roles/recorders/files/scripts/alerts/prometheus/gather.py
index c2bfb577..a50f3aae 100644
--- a/sre/roles/recorders/files/scripts/alerts/prometheus/gather.py
+++ b/sre/roles/recorders/files/scripts/alerts/prometheus/gather.py
@@ -45,15 +45,18 @@ def main():
             logger.warning("unable to query prometheus server")
         else:
             content = response.json()
+
             alerts = content.get("data", {}).get("alerts", [])
+            firing_alerts = list(filter(lambda a: a.get("state", "") == "firing", alerts))

             logger.info("retrieved {0} alerts from prometheus server".format(len(alerts)))
+            logger.info("{0} of the retrieved alerts are in firing state".format(len(firing_alerts)))

             utc_seconds = (datetime.now(timezone.utc) - datetime(1970, 1, 1, tzinfo=timezone.utc)).total_seconds()
             file_path = os.path.join(os.path.expanduser("~"), "records", "{0}-alerts.json".format(round(utc_seconds)))

             with open(file_path, "w") as f:
-                json.dump(alerts, f, indent=4)
+                json.dump(firing_alerts, f, indent=4)

         sleep_interval = (next_datetime - datetime.now()).total_seconds()
         if sleep_interval > 0:

From dcda57da89dfeb4cc5a5cb51ce3ec5a7fb4e0820 Mon Sep 17 00:00:00 2001
From: Gerard Vanloo
Date: Tue, 19 Aug 2025 10:44:38 -0400
Subject: [PATCH 18/35] chore: reorganize files

Signed-off-by: Gerard Vanloo
---
 .github/dependabot.yml | 4 ++--
 .../prometheus => alerts/prometheus/scripts}/gather.py | 0
 .../prometheus/scripts}/requirements.txt | 0
 .../prometheus.yaml => alerts/prometheus/statefulset.yaml} | 0
 .../traces/jaeger => traces/jaeger/scripts}/gather.py | 0
 .../jaeger => traces/jaeger/scripts}/requirements.txt | 0
 .../traces/jaeger.yaml => traces/jaeger/statefulset.yaml} | 0
 .../tasks/install_alerts_recorders_prometheus.yaml | 6 +++---
 .../recorders/tasks/install_traces_recorders_jaeger.yaml | 6 +++---
 .../tasks/uninstall_alerts_recorders_prometheus.yaml | 2 +-
 .../recorders/tasks/uninstall_traces_recorders_jaeger.yaml | 2 +-
 11 files changed, 10 insertions(+), 10 deletions(-)
 rename sre/roles/recorders/files/{scripts/alerts/prometheus => alerts/prometheus/scripts}/gather.py (100%)
 rename sre/roles/recorders/files/{scripts/alerts/prometheus => alerts/prometheus/scripts}/requirements.txt (100%)
 rename sre/roles/recorders/files/{kubernetes/alerts/prometheus.yaml => alerts/prometheus/statefulset.yaml} (100%)
 rename sre/roles/recorders/files/{scripts/traces/jaeger => traces/jaeger/scripts}/gather.py (100%)
 rename sre/roles/recorders/files/{scripts/traces/jaeger => traces/jaeger/scripts}/requirements.txt (100%)
 rename sre/roles/recorders/files/{kubernetes/traces/jaeger.yaml => traces/jaeger/statefulset.yaml} (100%)

diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index 4f12279e..779503a9 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -20,7 +20,7 @@ updates:

   - package-ecosystem: "docker"
     directories:
-      - "sre/roles/recorders/files/kubernetes/"
+      - "sre/roles/recorders/files/**/**/"
       - "sre/roles/tools/files/kubernetes/"
       - "sre/tools/kubernetes-topology-monitor/"
       - 
"sre/tools/kubernetes-topology-monitor/charts/kubernetes-topology-monitor/templates/" @@ -45,7 +45,7 @@ updates: - "/" - "sre/" - "sre/dev/remote_cluster/" - - "sre/roles/recorders/files/scripts/**/" + - "sre/roles/recorders/files/**/**/scripts/" - "sre/tools/kubernetes-topology-monitor/" groups: pip-production-dependencies: diff --git a/sre/roles/recorders/files/scripts/alerts/prometheus/gather.py b/sre/roles/recorders/files/alerts/prometheus/scripts/gather.py similarity index 100% rename from sre/roles/recorders/files/scripts/alerts/prometheus/gather.py rename to sre/roles/recorders/files/alerts/prometheus/scripts/gather.py diff --git a/sre/roles/recorders/files/scripts/alerts/prometheus/requirements.txt b/sre/roles/recorders/files/alerts/prometheus/scripts/requirements.txt similarity index 100% rename from sre/roles/recorders/files/scripts/alerts/prometheus/requirements.txt rename to sre/roles/recorders/files/alerts/prometheus/scripts/requirements.txt diff --git a/sre/roles/recorders/files/kubernetes/alerts/prometheus.yaml b/sre/roles/recorders/files/alerts/prometheus/statefulset.yaml similarity index 100% rename from sre/roles/recorders/files/kubernetes/alerts/prometheus.yaml rename to sre/roles/recorders/files/alerts/prometheus/statefulset.yaml diff --git a/sre/roles/recorders/files/scripts/traces/jaeger/gather.py b/sre/roles/recorders/files/traces/jaeger/scripts/gather.py similarity index 100% rename from sre/roles/recorders/files/scripts/traces/jaeger/gather.py rename to sre/roles/recorders/files/traces/jaeger/scripts/gather.py diff --git a/sre/roles/recorders/files/scripts/traces/jaeger/requirements.txt b/sre/roles/recorders/files/traces/jaeger/scripts/requirements.txt similarity index 100% rename from sre/roles/recorders/files/scripts/traces/jaeger/requirements.txt rename to sre/roles/recorders/files/traces/jaeger/scripts/requirements.txt diff --git a/sre/roles/recorders/files/kubernetes/traces/jaeger.yaml b/sre/roles/recorders/files/traces/jaeger/statefulset.yaml similarity index 100% rename from sre/roles/recorders/files/kubernetes/traces/jaeger.yaml rename to sre/roles/recorders/files/traces/jaeger/statefulset.yaml diff --git a/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml b/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml index 70ee67b6..37fe4af6 100644 --- a/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml +++ b/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml @@ -34,15 +34,15 @@ name: alerts-recorder-prometheus-scripts namespace: "{{ recorders_namespace.name }}" data: - deps: "{{ lookup('ansible.builtin.file', 'files/scripts/alerts/prometheus/requirements.txt') }}" - script: "{{ lookup('ansible.builtin.file', 'files/scripts/alerts/prometheus/gather.py') }}" + deps: "{{ lookup('ansible.builtin.file', 'files/alerts/prometheus/scripts/requirements.txt') }}" + script: "{{ lookup('ansible.builtin.file', 'files/alerts/prometheus/scripts/gather.py') }}" state: present - name: Install Prometheus Alert Recorder kubernetes.core.k8s: kubeconfig: "{{ recorders_cluster.kubeconfig }}" namespace: "{{ recorders_namespace.name }}" - src: files/kubernetes/alerts/prometheus.yaml + src: files/alerts/prometheus/statefulset.yaml state: present - name: Create Prometheus Alert Recorder environment list diff --git a/sre/roles/recorders/tasks/install_traces_recorders_jaeger.yaml b/sre/roles/recorders/tasks/install_traces_recorders_jaeger.yaml index f8592e17..c6815efb 100644 --- 
a/sre/roles/recorders/tasks/install_traces_recorders_jaeger.yaml +++ b/sre/roles/recorders/tasks/install_traces_recorders_jaeger.yaml @@ -18,15 +18,15 @@ name: traces-recorder-jaeger-scripts namespace: "{{ recorders_namespace.name }}" data: - deps: "{{ lookup('ansible.builtin.file', 'files/scripts/traces/jaeger/requirements.txt') }}" - script: "{{ lookup('ansible.builtin.file', 'files/scripts/traces/jaeger/gather.py') }}" + deps: "{{ lookup('ansible.builtin.file', 'files/traces/jaeger/scripts/requirements.txt') }}" + script: "{{ lookup('ansible.builtin.file', 'files/traces/jaeger/scripts/gather.py') }}" state: present - name: Install Jaeger Traces Recorder kubernetes.core.k8s: kubeconfig: "{{ recorders_cluster.kubeconfig }}" namespace: "{{ recorders_namespace.name }}" - src: files/kubernetes/traces/jaeger.yaml + src: files/traces/jaeger/statefulset.yaml state: present - name: Update Jaeger Traces environment variables diff --git a/sre/roles/recorders/tasks/uninstall_alerts_recorders_prometheus.yaml b/sre/roles/recorders/tasks/uninstall_alerts_recorders_prometheus.yaml index 917031e2..d216f672 100644 --- a/sre/roles/recorders/tasks/uninstall_alerts_recorders_prometheus.yaml +++ b/sre/roles/recorders/tasks/uninstall_alerts_recorders_prometheus.yaml @@ -26,7 +26,7 @@ kubernetes.core.k8s: kubeconfig: "{{ recorders_cluster.kubeconfig }}" namespace: "{{ recorders_namespace.name }}" - src: files/kubernetes/alerts/prometheus.yaml + src: files/alerts/prometheus/statefulset.yaml state: absent wait: true diff --git a/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml b/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml index a559c5e3..39412577 100644 --- a/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml +++ b/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml @@ -26,7 +26,7 @@ kubernetes.core.k8s: kubeconfig: "{{ recorders_cluster.kubeconfig }}" namespace: "{{ recorders_namespace.name }}" - src: files/kubernetes/traces/jaeger.yaml + src: files/traces/jaeger/statefulset.yaml state: absent wait: true From 438f2cfb5871585bfa20bf0bdf89f87aca0cd2df Mon Sep 17 00:00:00 2001 From: Gerard Vanloo Date: Tue, 19 Aug 2025 18:36:35 -0400 Subject: [PATCH 19/35] feat: add local storage option for records Signed-off-by: Gerard Vanloo --- sre/playbooks/manage_recorders.yaml | 2 ++ sre/roles/recorders/meta/argument_specs.yaml | 11 +++++++++++ ...uninstall_alerts_recorders_prometheus.yaml | 19 +++++++++++++++++++ .../uninstall_traces_recorders_jaeger.yaml | 19 +++++++++++++++++++ 4 files changed, 51 insertions(+) diff --git a/sre/playbooks/manage_recorders.yaml b/sre/playbooks/manage_recorders.yaml index b9f880a0..17afceb3 100644 --- a/sre/playbooks/manage_recorders.yaml +++ b/sre/playbooks/manage_recorders.yaml @@ -48,3 +48,5 @@ kubernetes: "{{ incidents_tools.kubernetes_topology_monitor | default(tools.kubernetes_topology_monitor) }}" traces: jaeger: "{{ incidents_tools.jaeger | default(tools.jaeger) }}" + recorders_storage: + local: "{{ storage.local | default(omit) }}" diff --git a/sre/roles/recorders/meta/argument_specs.yaml b/sre/roles/recorders/meta/argument_specs.yaml index 49a7bf28..8a826880 100644 --- a/sre/roles/recorders/meta/argument_specs.yaml +++ b/sre/roles/recorders/meta/argument_specs.yaml @@ -51,3 +51,14 @@ argument_specs: default: true required: false type: bool + recorders_storage: + required: true + type: dict + options: + local: + required: false + type: dict + options: + directory: + required: true + type: str diff --git 
a/sre/roles/recorders/tasks/uninstall_alerts_recorders_prometheus.yaml b/sre/roles/recorders/tasks/uninstall_alerts_recorders_prometheus.yaml index d216f672..4cb4f5be 100644 --- a/sre/roles/recorders/tasks/uninstall_alerts_recorders_prometheus.yaml +++ b/sre/roles/recorders/tasks/uninstall_alerts_recorders_prometheus.yaml @@ -68,3 +68,22 @@ - recorders_cluster.platform == "openshift" - recorders_prometheus_secret_info is defined - recorders_prometheus_secret_info.resources | length == 1 + +- name: Find all exported JSON files + ansible.builtin.find: + path: /tmp/alerts + patterns: + - "*.json" + register: recorders_files + +- name: Copy exported data into local directory + ansible.builtin.copy: + dest: "{{ recorders_storage.local.directory }}/{{ file.path | basename }}" + mode: "0644" + src: "{{ file.path }}" + loop: "{{ recorders_files.files }}" + loop_control: + label: file/{{ file.path | basename }} + loop_var: file + when: + - recorders_storage.local is defined diff --git a/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml b/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml index 39412577..4971adf0 100644 --- a/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml +++ b/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml @@ -41,3 +41,22 @@ namespace: "{{ recorders_namespace.name }}" state: absent wait: true + +- name: Find all exported JSON files + ansible.builtin.find: + path: /tmp/traces + patterns: + - "*.json" + register: recorders_files + +- name: Copy exported data into local directory + ansible.builtin.copy: + dest: "{{ recorders_storage.local.directory }}/{{ file.path | basename }}" + mode: "0644" + src: "{{ file.path }}" + loop: "{{ recorders_files.files }}" + loop_control: + label: file/{{ file.path | basename }} + loop_var: file + when: + - recorders_storage.local is defined From adcdcbd0aba2a361fb1de2335fb245b969dfe976 Mon Sep 17 00:00:00 2001 From: Gerard Vanloo Date: Tue, 19 Aug 2025 18:51:37 -0400 Subject: [PATCH 20/35] chore: sleep after downloading nodes for 30 minutes Signed-off-by: Gerard Vanloo --- .../scripts/topology/kubernetes/gather.py | 60 ------------------- .../topology/kubernetes/scripts/gather.py | 55 +++++++++++++++++ .../kubernetes/scripts}/requirements.txt | 0 .../kubernetes/statefulset.yaml} | 0 ...install_topology_recorders_kubernetes.yaml | 6 +- ...install_topology_recorders_kubernetes.yaml | 21 ++++++- 6 files changed, 78 insertions(+), 64 deletions(-) delete mode 100644 sre/roles/recorders/files/scripts/topology/kubernetes/gather.py create mode 100644 sre/roles/recorders/files/topology/kubernetes/scripts/gather.py rename sre/roles/recorders/files/{scripts/topology/kubernetes => topology/kubernetes/scripts}/requirements.txt (100%) rename sre/roles/recorders/files/{kubernetes/topology/kubernetes.yaml => topology/kubernetes/statefulset.yaml} (100%) diff --git a/sre/roles/recorders/files/scripts/topology/kubernetes/gather.py b/sre/roles/recorders/files/scripts/topology/kubernetes/gather.py deleted file mode 100644 index 5d1ccfc8..00000000 --- a/sre/roles/recorders/files/scripts/topology/kubernetes/gather.py +++ /dev/null @@ -1,60 +0,0 @@ -import datetime -import json -import logging -import os -import sys -import time - -from datetime import datetime, timedelta, timezone - -import requests - -from requests.adapters import HTTPAdapter -from urllib3.util import Retry - -# Logging -logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) - -logger = logging.getLogger(__name__) - - -def main(): - 
endpoint = os.environ.get("KUBERNETES_TOPOLOGY_MONITOR_ENDPOINT") - if endpoint is None: - sys.exit("error: KUBERNETES_TOPOLOGY_MONITOR_ENDPOINT environment variable is not set") - - headers = { "Content-Type": "application/json" } - - retries = Retry(total=3, backoff_factor=0.5) - adapter = HTTPAdapter(max_retries=retries) - - session = requests.Session() - session.mount("http://", adapter) - session.mount("https://", adapter) - - while True: - next_datetime = datetime.now() + timedelta(seconds=180) - - for item in ["nodes", "edges", "graph", "events"]: - response = session.get("{0}/{1}".format(endpoint, item), headers=headers, verify=True) - - if response.status_code != 200: - logger.warning("unable to query kubernetes topology mapper for {0}".format(item)) - else: - content = response.json() - - logger.info("retrieved {0} data".format(item)) - - utc_seconds = (datetime.now(timezone.utc) - datetime(1970, 1, 1, tzinfo=timezone.utc)).total_seconds() - file_path = os.path.join(os.path.expanduser("~"), "records", "{0}-{1}.json".format(round(utc_seconds), item)) - - with open(file_path, "w") as f: - json.dump(content, f, indent=4) - - sleep_interval = (next_datetime - datetime.now()).total_seconds() - if sleep_interval > 0: - logger.debug("sleep for {0} seconds".format(sleep_interval)) - time.sleep(sleep_interval) - -if __name__ == "__main__": - main() diff --git a/sre/roles/recorders/files/topology/kubernetes/scripts/gather.py b/sre/roles/recorders/files/topology/kubernetes/scripts/gather.py new file mode 100644 index 00000000..b5992810 --- /dev/null +++ b/sre/roles/recorders/files/topology/kubernetes/scripts/gather.py @@ -0,0 +1,55 @@ +import datetime +import json +import logging +import os +import sys +import time + +from datetime import datetime, timedelta, timezone + +import requests + +from requests.adapters import HTTPAdapter +from urllib3.util import Retry + +# Logging +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) + +logger = logging.getLogger(__name__) + + +def main(): + endpoint = os.environ.get("KUBERNETES_TOPOLOGY_MONITOR_ENDPOINT") + if endpoint is None: + sys.exit("error: KUBERNETES_TOPOLOGY_MONITOR_ENDPOINT environment variable is not set") + + headers = { "Content-Type": "application/json" } + + retries = Retry(total=3, backoff_factor=0.5) + adapter = HTTPAdapter(max_retries=retries) + + session = requests.Session() + session.mount("http://", adapter) + session.mount("https://", adapter) + + for item in ["nodes", "edges", "graph", "events"]: + response = session.get("{0}/{1}".format(endpoint, item), headers=headers, verify=True) + + if response.status_code != 200: + logger.warning("unable to query kubernetes topology mapper for {0}".format(item)) + else: + content = response.json() + + logger.info("retrieved {0} data".format(item)) + + utc_seconds = (datetime.now(timezone.utc) - datetime(1970, 1, 1, tzinfo=timezone.utc)).total_seconds() + file_path = os.path.join(os.path.expanduser("~"), "records", "{0}-{1}.json".format(round(utc_seconds), item)) + + with open(file_path, "w") as f: + json.dump(content, f, indent=4) + + logger.debug("download complete. 
begin sleeping.") + time.sleep(1800) + +if __name__ == "__main__": + main() diff --git a/sre/roles/recorders/files/scripts/topology/kubernetes/requirements.txt b/sre/roles/recorders/files/topology/kubernetes/scripts/requirements.txt similarity index 100% rename from sre/roles/recorders/files/scripts/topology/kubernetes/requirements.txt rename to sre/roles/recorders/files/topology/kubernetes/scripts/requirements.txt diff --git a/sre/roles/recorders/files/kubernetes/topology/kubernetes.yaml b/sre/roles/recorders/files/topology/kubernetes/statefulset.yaml similarity index 100% rename from sre/roles/recorders/files/kubernetes/topology/kubernetes.yaml rename to sre/roles/recorders/files/topology/kubernetes/statefulset.yaml diff --git a/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml b/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml index 954eef72..8c1df412 100644 --- a/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml +++ b/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml @@ -18,15 +18,15 @@ name: topology-recorder-kubernetes-scripts namespace: "{{ recorders_namespace.name }}" data: - deps: "{{ lookup('ansible.builtin.file', 'files/scripts/topology/kubernetes/requirements.txt') }}" - script: "{{ lookup('ansible.builtin.file', 'files/scripts/topology/kubernetes/gather.py') }}" + deps: "{{ lookup('ansible.builtin.file', 'files/topology/kubernetes/scripts/requirements.txt') }}" + script: "{{ lookup('ansible.builtin.file', 'files/topology/kubernetes/scripts/gather.py') }}" state: present - name: Install Kubernetes Topology Recorder kubernetes.core.k8s: kubeconfig: "{{ recorders_cluster.kubeconfig }}" namespace: "{{ recorders_namespace.name }}" - src: files/kubernetes/topology/kubernetes.yaml + src: files/topology/kubernetes/statefulset.yaml state: present - name: Update Kubernetes Topology Recorder environment variables diff --git a/sre/roles/recorders/tasks/uninstall_topology_recorders_kubernetes.yaml b/sre/roles/recorders/tasks/uninstall_topology_recorders_kubernetes.yaml index fa9662e2..5258e183 100644 --- a/sre/roles/recorders/tasks/uninstall_topology_recorders_kubernetes.yaml +++ b/sre/roles/recorders/tasks/uninstall_topology_recorders_kubernetes.yaml @@ -26,7 +26,7 @@ kubernetes.core.k8s: kubeconfig: "{{ recorders_cluster.kubeconfig }}" namespace: "{{ recorders_namespace.name }}" - src: files/kubernetes/topology/kubernetes.yaml + src: files/alerts/prometheus/statefulset.yaml state: absent wait: true @@ -41,3 +41,22 @@ namespace: "{{ recorders_namespace.name }}" state: absent wait: true + +- name: Find all exported JSON files + ansible.builtin.find: + path: /tmp/topology + patterns: + - "*.json" + register: recorders_files + +- name: Copy exported data into local directory + ansible.builtin.copy: + dest: "{{ recorders_storage.local.directory }}/{{ file.path | basename }}" + mode: "0644" + src: "{{ file.path }}" + loop: "{{ recorders_files.files }}" + loop_control: + label: file/{{ file.path | basename }} + loop_var: file + when: + - recorders_storage.local is defined From 3b174d2f5d0e5ca607dc0a280065448b093c6e90 Mon Sep 17 00:00:00 2001 From: Gerard Vanloo Date: Wed, 20 Aug 2025 10:09:51 -0400 Subject: [PATCH 21/35] style: replace configmap with template Signed-off-by: Gerard Vanloo --- .../install_alerts_recorders_prometheus.yaml | 28 +++++++------------ ...install_topology_recorders_kubernetes.yaml | 14 ++++------ .../install_traces_recorders_jaeger.yaml | 14 ++++------ 
.../templates/alerts/prometheus/configmap.j2 | 8 ++++++ .../templates/alerts/prometheus/secret.j2 | 7 +++++ .../topology/kubernetes/configmap.j2 | 8 ++++++ .../templates/traces/jaeger/configmap.j2 | 8 ++++++ 7 files changed, 51 insertions(+), 36 deletions(-) create mode 100644 sre/roles/recorders/templates/alerts/prometheus/configmap.j2 create mode 100644 sre/roles/recorders/templates/alerts/prometheus/secret.j2 create mode 100644 sre/roles/recorders/templates/topology/kubernetes/configmap.j2 create mode 100644 sre/roles/recorders/templates/traces/jaeger/configmap.j2 diff --git a/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml b/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml index 37fe4af6..319f67b2 100644 --- a/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml +++ b/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml @@ -8,18 +8,14 @@ kubeconfig: "{{ recorders_cluster.kubeconfig }}" platform: "{{ recorders_cluster.platform }}" -- name: Create Secret with Prometheus bearer token +- name: Create ConfigMap with Python script kubernetes.core.k8s: kubeconfig: "{{ recorders_cluster.kubeconfig }}" - resource_definition: - apiVersion: v1 - kind: Secret - metadata: - name: alerts-recorder-prometheus-token - namespace: "{{ recorders_namespace.name }}" - data: - token: "{{ tools_prometheus_bearer_token }}" + namespace: "{{ recorders_namespace.name }}" + template: templates/alerts/prometheus/secret.j2 state: present + vars: + prometheus_bearer_token: "{{ tools_prometheus_bearer_token }}" when: - recorders_cluster.platform == "openshift" - tools_prometheus_bearer_token is defined @@ -27,16 +23,12 @@ - name: Create ConfigMap with Python script kubernetes.core.k8s: kubeconfig: "{{ recorders_cluster.kubeconfig }}" - resource_definition: - apiVersion: v1 - kind: ConfigMap - metadata: - name: alerts-recorder-prometheus-scripts - namespace: "{{ recorders_namespace.name }}" - data: - deps: "{{ lookup('ansible.builtin.file', 'files/alerts/prometheus/scripts/requirements.txt') }}" - script: "{{ lookup('ansible.builtin.file', 'files/alerts/prometheus/scripts/gather.py') }}" + namespace: "{{ recorders_namespace.name }}" + template: templates/alerts/prometheus/configmap.j2 state: present + vars: + python_script_file_contents: "{{ lookup('ansible.builtin.file', 'files/alerts/prometheus/scripts/gather.py') }}" + requirements_file_contents: "{{ lookup('ansible.builtin.file', 'files/alerts/prometheus/scripts/requirements.txt') }}" - name: Install Prometheus Alert Recorder kubernetes.core.k8s: diff --git a/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml b/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml index 8c1df412..30ff2f3b 100644 --- a/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml +++ b/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml @@ -11,16 +11,12 @@ - name: Create ConfigMap with Python script kubernetes.core.k8s: kubeconfig: "{{ recorders_cluster.kubeconfig }}" - resource_definition: - apiVersion: v1 - kind: ConfigMap - metadata: - name: topology-recorder-kubernetes-scripts - namespace: "{{ recorders_namespace.name }}" - data: - deps: "{{ lookup('ansible.builtin.file', 'files/topology/kubernetes/scripts/requirements.txt') }}" - script: "{{ lookup('ansible.builtin.file', 'files/topology/kubernetes/scripts/gather.py') }}" + namespace: "{{ recorders_namespace.name }}" + template: templates/topology/kubernetes/configmap.j2 state: present + vars: + 
python_script_file_contents: "{{ lookup('ansible.builtin.file', 'files/topology/kubernetes/scripts/gather.py') }}" + requirements_file_contents: "{{ lookup('ansible.builtin.file', 'files/topology/kubernetes/scripts/requirements.txt') }}" - name: Install Kubernetes Topology Recorder kubernetes.core.k8s: diff --git a/sre/roles/recorders/tasks/install_traces_recorders_jaeger.yaml b/sre/roles/recorders/tasks/install_traces_recorders_jaeger.yaml index c6815efb..3c04e700 100644 --- a/sre/roles/recorders/tasks/install_traces_recorders_jaeger.yaml +++ b/sre/roles/recorders/tasks/install_traces_recorders_jaeger.yaml @@ -11,16 +11,12 @@ - name: Create ConfigMap with Python script kubernetes.core.k8s: kubeconfig: "{{ recorders_cluster.kubeconfig }}" - resource_definition: - apiVersion: v1 - kind: ConfigMap - metadata: - name: traces-recorder-jaeger-scripts - namespace: "{{ recorders_namespace.name }}" - data: - deps: "{{ lookup('ansible.builtin.file', 'files/traces/jaeger/scripts/requirements.txt') }}" - script: "{{ lookup('ansible.builtin.file', 'files/traces/jaeger/scripts/gather.py') }}" + namespace: "{{ recorders_namespace.name }}" + template: templates/traces/jaeger/configmap.j2 state: present + vars: + python_script_file_contents: "{{ lookup('ansible.builtin.file', 'files/traces/jaeger/scripts/gather.py') }}" + requirements_file_contents: "{{ lookup('ansible.builtin.file', 'files/traces/jaeger/scripts/requirements.txt') }}" - name: Install Jaeger Traces Recorder kubernetes.core.k8s: diff --git a/sre/roles/recorders/templates/alerts/prometheus/configmap.j2 b/sre/roles/recorders/templates/alerts/prometheus/configmap.j2 new file mode 100644 index 00000000..dcd0cbba --- /dev/null +++ b/sre/roles/recorders/templates/alerts/prometheus/configmap.j2 @@ -0,0 +1,8 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: alerts-recorder-prometheus-scripts +data: + deps: "{{ requirements_file_contents }}" + script: "{{ python_script_file_contents }}" diff --git a/sre/roles/recorders/templates/alerts/prometheus/secret.j2 b/sre/roles/recorders/templates/alerts/prometheus/secret.j2 new file mode 100644 index 00000000..d9aa35a2 --- /dev/null +++ b/sre/roles/recorders/templates/alerts/prometheus/secret.j2 @@ -0,0 +1,7 @@ +--- +apiVersion: v1 +kind: Secret +metadata: + name: alerts-recorder-prometheus-token +data: + token: "{{ prometheus_bearer_token }}" diff --git a/sre/roles/recorders/templates/topology/kubernetes/configmap.j2 b/sre/roles/recorders/templates/topology/kubernetes/configmap.j2 new file mode 100644 index 00000000..58aac8ff --- /dev/null +++ b/sre/roles/recorders/templates/topology/kubernetes/configmap.j2 @@ -0,0 +1,8 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: topology-recorder-kubernetes-scripts +data: + deps: "{{ requirements_file_contents }}" + script: "{{ python_script_file_contents }}" diff --git a/sre/roles/recorders/templates/traces/jaeger/configmap.j2 b/sre/roles/recorders/templates/traces/jaeger/configmap.j2 new file mode 100644 index 00000000..65dd4bf5 --- /dev/null +++ b/sre/roles/recorders/templates/traces/jaeger/configmap.j2 @@ -0,0 +1,8 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: traces-recorder-jaeger-scripts +data: + deps: "{{ requirements_file_contents }}" + script: "{{ python_script_file_contents }}" From b93bacfbdf625a1ec4feac0e73daeddc7cac1fba Mon Sep 17 00:00:00 2001 From: Gerard Vanloo Date: Wed, 20 Aug 2025 10:39:04 -0400 Subject: [PATCH 22/35] style: use combination of templates and files for object creation Signed-off-by: Gerard Vanloo 
--- .../files/alerts/prometheus/statefulset.yaml | 64 +--------------- .../topology/kubernetes/statefulset.yaml | 64 +--------------- .../files/traces/jaeger/statefulset.yaml | 64 +--------------- .../install_alerts_recorders_prometheus.yaml | 74 ++++++++----------- ...install_topology_recorders_kubernetes.yaml | 32 +++----- .../install_traces_recorders_jaeger.yaml | 32 +++----- ...uninstall_alerts_recorders_prometheus.yaml | 39 ++++------ ...install_topology_recorders_kubernetes.yaml | 13 ++-- .../uninstall_traces_recorders_jaeger.yaml | 13 ++-- .../templates/alerts/prometheus/configmap.j2 | 3 + .../templates/alerts/prometheus/secret.j2 | 3 + .../alerts/prometheus/statefulset.j2 | 72 ++++++++++++++++++ .../topology/kubernetes/configmap.j2 | 3 + .../topology/kubernetes/statefulset.j2 | 72 ++++++++++++++++++ .../templates/traces/jaeger/configmap.j2 | 3 + .../templates/traces/jaeger/statefulset.j2 | 72 ++++++++++++++++++ 16 files changed, 319 insertions(+), 304 deletions(-) create mode 100644 sre/roles/recorders/templates/alerts/prometheus/statefulset.j2 create mode 100644 sre/roles/recorders/templates/topology/kubernetes/statefulset.j2 create mode 100644 sre/roles/recorders/templates/traces/jaeger/statefulset.j2 diff --git a/sre/roles/recorders/files/alerts/prometheus/statefulset.yaml b/sre/roles/recorders/files/alerts/prometheus/statefulset.yaml index 7613c1fd..c7f5d164 100644 --- a/sre/roles/recorders/files/alerts/prometheus/statefulset.yaml +++ b/sre/roles/recorders/files/alerts/prometheus/statefulset.yaml @@ -1,71 +1,15 @@ --- +# This definition has been left purposely incomplete. This allows +# Dependabot to track the image. The full definition is provided in the +# template version of this object. + apiVersion: apps/v1 kind: StatefulSet metadata: - labels: - app.kubernetes.io/name: prometheus-alert-recorder - app.kubernetes.io/part-of: it-bench name: prometheus-alert-recorder spec: - selector: - matchLabels: - app.kubernetes.io/name: prometheus-alert-recorder - app.kubernetes.io/part-of: it-bench template: - metadata: - annotations: - openshift.io/required-scc: restricted-v2 - labels: - app.kubernetes.io/name: prometheus-alert-recorder - app.kubernetes.io/part-of: it-bench spec: containers: - name: recorder image: registry.access.redhat.com/ubi9/python-312:9.6-1754326132 - command: - - /bin/sh - args: - - -c - - "python3.12 -m pip install -r ~/deps/requirements.txt && python3.12 ~/scripts/gather.py" - resources: - requests: - cpu: 100m - memory: 125Mi - limits: - memory: 250Mi - volumeMounts: - - name: dependencies - mountPath: /opt/app-root/src/deps - readOnly: true - - name: scripts - mountPath: /opt/app-root/src/scripts - readOnly: true - - name: prometheus-alert-records - mountPath: /opt/app-root/src/records - securityContext: - fsGroup: 1001 - volumes: - - name: scripts - configMap: - name: alerts-recorder-prometheus-scripts - items: - - key: script - path: gather.py - - name: dependencies - configMap: - name: alerts-recorder-prometheus-scripts - items: - - key: deps - path: requirements.txt - replicas: 1 - volumeClaimTemplates: - - metadata: - name: prometheus-alert-records - spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 1Gi - persistentVolumeClaimRetentionPolicy: - whenDeleted: Delete diff --git a/sre/roles/recorders/files/topology/kubernetes/statefulset.yaml b/sre/roles/recorders/files/topology/kubernetes/statefulset.yaml index 7393231c..5bc17cac 100644 --- a/sre/roles/recorders/files/topology/kubernetes/statefulset.yaml +++ 
b/sre/roles/recorders/files/topology/kubernetes/statefulset.yaml @@ -1,71 +1,15 @@ --- +# This definition has been left purposely incomplete. This allows +# Dependabot to track the image. The full definition is provided in the +# template version of this object. + apiVersion: apps/v1 kind: StatefulSet metadata: - labels: - app.kubernetes.io/name: kubernetes-topology-recorder - app.kubernetes.io/part-of: it-bench name: kubernetes-topology-recorder spec: - selector: - matchLabels: - app.kubernetes.io/name: kubernetes-topology-recorder - app.kubernetes.io/part-of: it-bench template: - metadata: - annotations: - openshift.io/required-scc: restricted-v2 - labels: - app.kubernetes.io/name: kubernetes-topology-recorder - app.kubernetes.io/part-of: it-bench spec: containers: - name: recorder image: registry.access.redhat.com/ubi9/python-312:9.6-1754326132 - command: - - /bin/sh - args: - - -c - - "python3.12 -m pip install -r ~/deps/requirements.txt && python3.12 ~/scripts/gather.py" - resources: - requests: - cpu: 100m - memory: 125Mi - limits: - memory: 250Mi - volumeMounts: - - name: dependencies - mountPath: /opt/app-root/src/deps - readOnly: true - - name: scripts - mountPath: /opt/app-root/src/scripts - readOnly: true - - name: kubernetes-topology-records - mountPath: /opt/app-root/src/records - securityContext: - fsGroup: 1001 - volumes: - - name: scripts - configMap: - name: topology-recorder-kubernetes-scripts - items: - - key: script - path: gather.py - - name: dependencies - configMap: - name: topology-recorder-kubernetes-scripts - items: - - key: deps - path: requirements.txt - replicas: 1 - volumeClaimTemplates: - - metadata: - name: kubernetes-topology-records - spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 5Gi - persistentVolumeClaimRetentionPolicy: - whenDeleted: Delete diff --git a/sre/roles/recorders/files/traces/jaeger/statefulset.yaml b/sre/roles/recorders/files/traces/jaeger/statefulset.yaml index 53167db3..a9a4725b 100644 --- a/sre/roles/recorders/files/traces/jaeger/statefulset.yaml +++ b/sre/roles/recorders/files/traces/jaeger/statefulset.yaml @@ -1,71 +1,15 @@ --- +# This definition has been left purposely incomplete. This allows +# Dependabot to track the image. The full definition is provided in the +# template version of this object. 
+ apiVersion: apps/v1 kind: StatefulSet metadata: - labels: - app.kubernetes.io/name: jaeger-traces-recorder - app.kubernetes.io/part-of: it-bench name: jaeger-traces-recorder spec: - selector: - matchLabels: - app.kubernetes.io/name: jaeger-traces-recorder - app.kubernetes.io/part-of: it-bench template: - metadata: - annotations: - openshift.io/required-scc: restricted-v2 - labels: - app.kubernetes.io/name: jaeger-traces-recorder - app.kubernetes.io/part-of: it-bench spec: containers: - name: recorder image: registry.access.redhat.com/ubi9/python-312:9.6-1754326132 - command: - - /bin/sh - args: - - -c - - "python3.12 -m pip install -r ~/deps/requirements.txt && python3.12 ~/scripts/gather.py" - resources: - requests: - cpu: 100m - memory: 125Mi - limits: - memory: 250Mi - volumeMounts: - - name: dependencies - mountPath: /opt/app-root/src/deps - readOnly: true - - name: scripts - mountPath: /opt/app-root/src/scripts - readOnly: true - - name: jaeger-trace-records - mountPath: /opt/app-root/src/records - securityContext: - fsGroup: 1001 - volumes: - - name: scripts - configMap: - name: traces-recorder-jaeger-scripts - items: - - key: script - path: gather.py - - name: dependencies - configMap: - name: traces-recorder-jaeger-scripts - items: - - key: deps - path: requirements.txt - replicas: 1 - volumeClaimTemplates: - - metadata: - name: jaeger-trace-records - spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 5Gi - persistentVolumeClaimRetentionPolicy: - whenDeleted: Delete diff --git a/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml b/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml index 319f67b2..f40707f1 100644 --- a/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml +++ b/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml @@ -8,34 +8,9 @@ kubeconfig: "{{ recorders_cluster.kubeconfig }}" platform: "{{ recorders_cluster.platform }}" -- name: Create ConfigMap with Python script - kubernetes.core.k8s: - kubeconfig: "{{ recorders_cluster.kubeconfig }}" - namespace: "{{ recorders_namespace.name }}" - template: templates/alerts/prometheus/secret.j2 - state: present - vars: - prometheus_bearer_token: "{{ tools_prometheus_bearer_token }}" - when: - - recorders_cluster.platform == "openshift" - - tools_prometheus_bearer_token is defined - -- name: Create ConfigMap with Python script - kubernetes.core.k8s: - kubeconfig: "{{ recorders_cluster.kubeconfig }}" - namespace: "{{ recorders_namespace.name }}" - template: templates/alerts/prometheus/configmap.j2 - state: present - vars: - python_script_file_contents: "{{ lookup('ansible.builtin.file', 'files/alerts/prometheus/scripts/gather.py') }}" - requirements_file_contents: "{{ lookup('ansible.builtin.file', 'files/alerts/prometheus/scripts/requirements.txt') }}" - -- name: Install Prometheus Alert Recorder - kubernetes.core.k8s: - kubeconfig: "{{ recorders_cluster.kubeconfig }}" - namespace: "{{ recorders_namespace.name }}" - src: files/alerts/prometheus/statefulset.yaml - state: present +- name: Load statefulset information + ansible.builtin.set_fact: + recorders_statefulset: "{{ lookup('ansible.builtin.file', 'files/topology/kubernetes/statefulset.yaml') | from_yaml }}" - name: Create Prometheus Alert Recorder environment list ansible.builtin.set_fact: @@ -65,24 +40,37 @@ - tools_prometheus_bearer_token is defined - recorders_prometheus_env_vars is defined -- name: Update Prometheus Alert Recorder environment variables +- name: Create Secret with bearer 
token kubernetes.core.k8s: kubeconfig: "{{ recorders_cluster.kubeconfig }}" - resource_definition: - apiVersion: apps/v1 - kind: StatefulSet - metadata: - name: prometheus-alert-recorder - namespace: "{{ recorders_namespace.name }}" - spec: - template: - spec: - containers: - - name: recorder - env: "{{ recorders_prometheus_env_vars }}" - state: patched + namespace: "{{ recorders_namespace.name }}" + template: templates/alerts/prometheus/secret.j2 + state: present + vars: + prometheus_bearer_token: "{{ tools_prometheus_bearer_token }}" when: - - recorders_prometheus_env_vars is defined + - recorders_cluster.platform == "openshift" + - tools_prometheus_bearer_token is defined + +- name: Create ConfigMap with Python script + kubernetes.core.k8s: + kubeconfig: "{{ recorders_cluster.kubeconfig }}" + namespace: "{{ recorders_namespace.name }}" + template: templates/alerts/prometheus/configmap.j2 + state: present + vars: + python_script_file_contents: "{{ lookup('ansible.builtin.file', 'files/alerts/prometheus/scripts/gather.py') }}" + requirements_file_contents: "{{ lookup('ansible.builtin.file', 'files/alerts/prometheus/scripts/requirements.txt') }}" + +- name: Install Prometheus Alert Recorder + kubernetes.core.k8s: + kubeconfig: "{{ recorders_cluster.kubeconfig }}" + namespace: "{{ recorders_namespace.name }}" + template: templates/alerts/prometheus/statefulset.j2 + state: present + vars: + container_image: "{{ recorders_statefulset.spec.template.spec.containers[0].image }}" + container_environment_variables: "{{ recorders_prometheus_env_vars }}" - name: Wait for workload to update kubernetes.core.k8s_info: diff --git a/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml b/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml index 30ff2f3b..1b9b7376 100644 --- a/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml +++ b/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml @@ -8,6 +8,10 @@ kubeconfig: "{{ recorders_cluster.kubeconfig }}" platform: "{{ recorders_cluster.platform }}" +- name: Load statefulset information + ansible.builtin.set_fact: + recorders_statefulset: "{{ lookup('ansible.builtin.file', 'files/topology/kubernetes/statefulset.yaml') | from_yaml }}" + - name: Create ConfigMap with Python script kubernetes.core.k8s: kubeconfig: "{{ recorders_cluster.kubeconfig }}" @@ -22,29 +26,13 @@ kubernetes.core.k8s: kubeconfig: "{{ recorders_cluster.kubeconfig }}" namespace: "{{ recorders_namespace.name }}" - src: files/topology/kubernetes/statefulset.yaml + template: templates/topology/kubernetes/statefulset.j2 state: present - -- name: Update Kubernetes Topology Recorder environment variables - kubernetes.core.k8s: - kubeconfig: "{{ recorders_cluster.kubeconfig }}" - resource_definition: - apiVersion: apps/v1 - kind: StatefulSet - metadata: - name: kubernetes-topology-recorder - namespace: "{{ recorders_namespace.name }}" - spec: - template: - spec: - containers: - - name: recorder - env: - - name: KUBERNETES_TOPOLOGY_MONITOR_ENDPOINT - value: "{{ tools_kubernetes_topology_mapper_endpoint }}" - state: patched - when: - - tools_kubernetes_topology_mapper_endpoint is defined + vars: + container_image: "{{ recorders_statefulset.spec.template.spec.containers[0].image }}" + container_environment_variables: + - name: KUBERNETES_TOPOLOGY_MONITOR_ENDPOINT + value: "{{ tools_kubernetes_topology_mapper_endpoint }}" - name: Wait for workload to update kubernetes.core.k8s_info: diff --git 
a/sre/roles/recorders/tasks/install_traces_recorders_jaeger.yaml b/sre/roles/recorders/tasks/install_traces_recorders_jaeger.yaml index 3c04e700..46ef840e 100644 --- a/sre/roles/recorders/tasks/install_traces_recorders_jaeger.yaml +++ b/sre/roles/recorders/tasks/install_traces_recorders_jaeger.yaml @@ -8,6 +8,10 @@ kubeconfig: "{{ recorders_cluster.kubeconfig }}" platform: "{{ recorders_cluster.platform }}" +- name: Load statefulset information + ansible.builtin.set_fact: + recorders_statefulset: "{{ lookup('ansible.builtin.file', 'files/traces/jaeger/statefulset.yaml') | from_yaml }}" + - name: Create ConfigMap with Python script kubernetes.core.k8s: kubeconfig: "{{ recorders_cluster.kubeconfig }}" @@ -22,29 +26,13 @@ kubernetes.core.k8s: kubeconfig: "{{ recorders_cluster.kubeconfig }}" namespace: "{{ recorders_namespace.name }}" - src: files/traces/jaeger/statefulset.yaml + template: templates/traces/jaeger/statefulset.j2 state: present - -- name: Update Jaeger Traces environment variables - kubernetes.core.k8s: - kubeconfig: "{{ recorders_cluster.kubeconfig }}" - resource_definition: - apiVersion: apps/v1 - kind: StatefulSet - metadata: - name: jaeger-traces-recorder - namespace: "{{ recorders_namespace.name }}" - spec: - template: - spec: - containers: - - name: recorder - env: - - name: JAEGER_ENDPOINT - value: "{{ tools_jaeger_querier_endpoint }}" - state: patched - when: - - tools_jaeger_querier_endpoint is defined + vars: + container_image: "{{ recorders_statefulset.spec.template.spec.containers[0].image }}" + container_environment_variables: + - name: KUBERNETES_TOPOLOGY_MONITOR_ENDPOINT + value: "{{ tools_kubernetes_topology_mapper_endpoint }}" - name: Wait for workload to update kubernetes.core.k8s_info: diff --git a/sre/roles/recorders/tasks/uninstall_alerts_recorders_prometheus.yaml b/sre/roles/recorders/tasks/uninstall_alerts_recorders_prometheus.yaml index 4cb4f5be..6a4fcbe9 100644 --- a/sre/roles/recorders/tasks/uninstall_alerts_recorders_prometheus.yaml +++ b/sre/roles/recorders/tasks/uninstall_alerts_recorders_prometheus.yaml @@ -32,42 +32,31 @@ - name: Delete ConfigMap with Python script kubernetes.core.k8s: + api_version: v1 + delete_all: true + kind: ConfigMap kubeconfig: "{{ recorders_cluster.kubeconfig }}" - resource_definition: - apiVersion: v1 - kind: ConfigMap - metadata: - name: alerts-recorder-prometheus-scripts - namespace: "{{ recorders_namespace.name }}" + namespace: "{{ recorders_namespace.name }}" + label_selectors: + - app.kubernetes.io/name = prometheus-alert-recorder + - app.kubernetes.io/part-of = it-bench state: absent wait: true -- name: Check for Secret with Prometheus bearer token - kubernetes.core.k8s_info: - api_version: route.openshift.io/v1 +- name: Delete Secret with bearer token + kubernetes.core.k8s: + api_version: v1 + delete_all: true kind: Secret kubeconfig: "{{ recorders_cluster.kubeconfig }}" - name: alerts-recorder-prometheus-token namespace: "{{ recorders_namespace.name }}" - register: recorders_prometheus_secret_info - when: - - recorders_cluster.platform == "openshift" - -- name: Delete Secret with Prometheus bearer token - kubernetes.core.k8s: - kubeconfig: "{{ recorders_cluster.kubeconfig }}" - resource_definition: - apiVersion: v1 - kind: Secret - metadata: - name: "{{ recorders_prometheus_secret_info.resources[0].metadata.name }}" - namespace: "{{ recorders_prometheus_secret_info.resources[0].metadata.namespace }}" + label_selectors: + - app.kubernetes.io/name = prometheus-alert-recorder + - app.kubernetes.io/part-of = it-bench 
state: absent wait: true when: - recorders_cluster.platform == "openshift" - - recorders_prometheus_secret_info is defined - - recorders_prometheus_secret_info.resources | length == 1 - name: Find all exported JSON files ansible.builtin.find: diff --git a/sre/roles/recorders/tasks/uninstall_topology_recorders_kubernetes.yaml b/sre/roles/recorders/tasks/uninstall_topology_recorders_kubernetes.yaml index 5258e183..59cfa136 100644 --- a/sre/roles/recorders/tasks/uninstall_topology_recorders_kubernetes.yaml +++ b/sre/roles/recorders/tasks/uninstall_topology_recorders_kubernetes.yaml @@ -32,13 +32,14 @@ - name: Delete ConfigMap with Python script kubernetes.core.k8s: + api_version: v1 + delete_all: true + kind: ConfigMap kubeconfig: "{{ recorders_cluster.kubeconfig }}" - resource_definition: - apiVersion: v1 - kind: ConfigMap - metadata: - name: topology-recorder-kubernetes-scripts - namespace: "{{ recorders_namespace.name }}" + namespace: "{{ recorders_namespace.name }}" + label_selectors: + - app.kubernetes.io/name = kubernetes-topology-recorder + - app.kubernetes.io/part-of = it-bench state: absent wait: true diff --git a/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml b/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml index 4971adf0..ee7a5e27 100644 --- a/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml +++ b/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml @@ -32,13 +32,14 @@ - name: Delete ConfigMap with Python script kubernetes.core.k8s: + api_version: v1 + delete_all: true + kind: ConfigMap kubeconfig: "{{ recorders_cluster.kubeconfig }}" - resource_definition: - apiVersion: v1 - kind: ConfigMap - metadata: - name: traces-recorder-jaeger-scripts - namespace: "{{ recorders_namespace.name }}" + namespace: "{{ recorders_namespace.name }}" + label_selectors: + - app.kubernetes.io/name = jaeger-traces-recorder + - app.kubernetes.io/part-of = it-bench state: absent wait: true diff --git a/sre/roles/recorders/templates/alerts/prometheus/configmap.j2 b/sre/roles/recorders/templates/alerts/prometheus/configmap.j2 index dcd0cbba..1b9a3c6b 100644 --- a/sre/roles/recorders/templates/alerts/prometheus/configmap.j2 +++ b/sre/roles/recorders/templates/alerts/prometheus/configmap.j2 @@ -2,6 +2,9 @@ apiVersion: v1 kind: ConfigMap metadata: + labels: + app.kubernetes.io/name: prometheus-alert-recorder + app.kubernetes.io/part-of: it-bench name: alerts-recorder-prometheus-scripts data: deps: "{{ requirements_file_contents }}" diff --git a/sre/roles/recorders/templates/alerts/prometheus/secret.j2 b/sre/roles/recorders/templates/alerts/prometheus/secret.j2 index d9aa35a2..f731d990 100644 --- a/sre/roles/recorders/templates/alerts/prometheus/secret.j2 +++ b/sre/roles/recorders/templates/alerts/prometheus/secret.j2 @@ -2,6 +2,9 @@ apiVersion: v1 kind: Secret metadata: + labels: + app.kubernetes.io/name: prometheus-alert-recorder + app.kubernetes.io/part-of: it-bench name: alerts-recorder-prometheus-token data: token: "{{ prometheus_bearer_token }}" diff --git a/sre/roles/recorders/templates/alerts/prometheus/statefulset.j2 b/sre/roles/recorders/templates/alerts/prometheus/statefulset.j2 new file mode 100644 index 00000000..25e40d89 --- /dev/null +++ b/sre/roles/recorders/templates/alerts/prometheus/statefulset.j2 @@ -0,0 +1,72 @@ +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + labels: + app.kubernetes.io/name: prometheus-alert-recorder + app.kubernetes.io/part-of: it-bench + name: prometheus-alert-recorder +spec: + selector: + 
matchLabels: + app.kubernetes.io/name: prometheus-alert-recorder + app.kubernetes.io/part-of: it-bench + template: + metadata: + annotations: + openshift.io/required-scc: restricted-v2 + labels: + app.kubernetes.io/name: prometheus-alert-recorder + app.kubernetes.io/part-of: it-bench + spec: + containers: + - name: recorder + image: "{{ container_image }}" + command: + - /bin/sh + args: + - -c + - "python3.12 -m pip install -r ~/deps/requirements.txt && python3.12 ~/scripts/gather.py" + env: {{ container_environment_variables }} + resources: + requests: + cpu: 100m + memory: 125Mi + limits: + memory: 250Mi + volumeMounts: + - name: dependencies + mountPath: /opt/app-root/src/deps + readOnly: true + - name: scripts + mountPath: /opt/app-root/src/scripts + readOnly: true + - name: prometheus-alert-records + mountPath: /opt/app-root/src/records + securityContext: + fsGroup: 1001 + volumes: + - name: scripts + configMap: + name: alerts-recorder-prometheus-scripts + items: + - key: script + path: gather.py + - name: dependencies + configMap: + name: alerts-recorder-prometheus-scripts + items: + - key: deps + path: requirements.txt + replicas: 1 + volumeClaimTemplates: + - metadata: + name: prometheus-alert-records + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 1Gi + persistentVolumeClaimRetentionPolicy: + whenDeleted: Delete diff --git a/sre/roles/recorders/templates/topology/kubernetes/configmap.j2 b/sre/roles/recorders/templates/topology/kubernetes/configmap.j2 index 58aac8ff..4842e641 100644 --- a/sre/roles/recorders/templates/topology/kubernetes/configmap.j2 +++ b/sre/roles/recorders/templates/topology/kubernetes/configmap.j2 @@ -2,6 +2,9 @@ apiVersion: v1 kind: ConfigMap metadata: + labels: + app.kubernetes.io/name: kubernetes-topology-recorder + app.kubernetes.io/part-of: it-bench name: topology-recorder-kubernetes-scripts data: deps: "{{ requirements_file_contents }}" diff --git a/sre/roles/recorders/templates/topology/kubernetes/statefulset.j2 b/sre/roles/recorders/templates/topology/kubernetes/statefulset.j2 new file mode 100644 index 00000000..9e880351 --- /dev/null +++ b/sre/roles/recorders/templates/topology/kubernetes/statefulset.j2 @@ -0,0 +1,72 @@ +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + labels: + app.kubernetes.io/name: kubernetes-topology-recorder + app.kubernetes.io/part-of: it-bench + name: kubernetes-topology-recorder +spec: + selector: + matchLabels: + app.kubernetes.io/name: kubernetes-topology-recorder + app.kubernetes.io/part-of: it-bench + template: + metadata: + annotations: + openshift.io/required-scc: restricted-v2 + labels: + app.kubernetes.io/name: kubernetes-topology-recorder + app.kubernetes.io/part-of: it-bench + spec: + containers: + - name: recorder + image: "{{ container_image }}" + command: + - /bin/sh + args: + - -c + - "python3.12 -m pip install -r ~/deps/requirements.txt && python3.12 ~/scripts/gather.py" + env: {{ container_environment_variables }} + resources: + requests: + cpu: 100m + memory: 125Mi + limits: + memory: 250Mi + volumeMounts: + - name: dependencies + mountPath: /opt/app-root/src/deps + readOnly: true + - name: scripts + mountPath: /opt/app-root/src/scripts + readOnly: true + - name: kubernetes-topology-records + mountPath: /opt/app-root/src/records + securityContext: + fsGroup: 1001 + volumes: + - name: scripts + configMap: + name: topology-recorder-kubernetes-scripts + items: + - key: script + path: gather.py + - name: dependencies + configMap: + name: topology-recorder-kubernetes-scripts + 
items: + - key: deps + path: requirements.txt + replicas: 1 + volumeClaimTemplates: + - metadata: + name: kubernetes-topology-records + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 5Gi + persistentVolumeClaimRetentionPolicy: + whenDeleted: Delete diff --git a/sre/roles/recorders/templates/traces/jaeger/configmap.j2 b/sre/roles/recorders/templates/traces/jaeger/configmap.j2 index 65dd4bf5..01240c14 100644 --- a/sre/roles/recorders/templates/traces/jaeger/configmap.j2 +++ b/sre/roles/recorders/templates/traces/jaeger/configmap.j2 @@ -2,6 +2,9 @@ apiVersion: v1 kind: ConfigMap metadata: + labels: + app.kubernetes.io/name: jaeger-traces-recorder + app.kubernetes.io/part-of: it-bench name: traces-recorder-jaeger-scripts data: deps: "{{ requirements_file_contents }}" diff --git a/sre/roles/recorders/templates/traces/jaeger/statefulset.j2 b/sre/roles/recorders/templates/traces/jaeger/statefulset.j2 new file mode 100644 index 00000000..8e919788 --- /dev/null +++ b/sre/roles/recorders/templates/traces/jaeger/statefulset.j2 @@ -0,0 +1,72 @@ +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + labels: + app.kubernetes.io/name: jaeger-traces-recorder + app.kubernetes.io/part-of: it-bench + name: jaeger-traces-recorder +spec: + selector: + matchLabels: + app.kubernetes.io/name: jaeger-traces-recorder + app.kubernetes.io/part-of: it-bench + template: + metadata: + annotations: + openshift.io/required-scc: restricted-v2 + labels: + app.kubernetes.io/name: jaeger-traces-recorder + app.kubernetes.io/part-of: it-bench + spec: + containers: + - name: recorder + image: "{{ container_image }}" + command: + - /bin/sh + args: + - -c + - "python3.12 -m pip install -r ~/deps/requirements.txt && python3.12 ~/scripts/gather.py" + env: {{ container_environment_variables }} + resources: + requests: + cpu: 100m + memory: 125Mi + limits: + memory: 250Mi + volumeMounts: + - name: dependencies + mountPath: /opt/app-root/src/deps + readOnly: true + - name: scripts + mountPath: /opt/app-root/src/scripts + readOnly: true + - name: jaeger-trace-records + mountPath: /opt/app-root/src/records + securityContext: + fsGroup: 1001 + volumes: + - name: scripts + configMap: + name: traces-recorder-jaeger-scripts + items: + - key: script + path: gather.py + - name: dependencies + configMap: + name: traces-recorder-jaeger-scripts + items: + - key: deps + path: requirements.txt + replicas: 1 + volumeClaimTemplates: + - metadata: + name: jaeger-trace-records + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 5Gi + persistentVolumeClaimRetentionPolicy: + whenDeleted: Delete From e882e3d1fa8170160e58f96d15152da92dd06088 Mon Sep 17 00:00:00 2001 From: Gerard Vanloo Date: Wed, 20 Aug 2025 10:43:02 -0400 Subject: [PATCH 23/35] style: become independent of objects names Signed-off-by: Gerard Vanloo --- .../tasks/install_alerts_recorders_prometheus.yaml | 11 +++++++---- .../tasks/install_topology_recorders_kubernetes.yaml | 11 +++++++---- .../tasks/install_traces_recorders_jaeger.yaml | 11 +++++++---- 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml b/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml index f40707f1..08f44ffc 100644 --- a/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml +++ b/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml @@ -68,17 +68,18 @@ namespace: "{{ recorders_namespace.name }}" template: 
templates/alerts/prometheus/statefulset.j2 state: present + register: recorders_workload vars: container_image: "{{ recorders_statefulset.spec.template.spec.containers[0].image }}" container_environment_variables: "{{ recorders_prometheus_env_vars }}" - name: Wait for workload to update kubernetes.core.k8s_info: - api_version: apps/v1 - kind: StatefulSet + api_version: "{{ recorders_workload.result.api_version }}" + kind: "{{ recorders_workload.result.kind }}" kubeconfig: "{{ recorders_cluster.kubeconfig }}" - name: prometheus-alert-recorder - namespace: "{{ recorders_namespace.name }}" + name: "{{ recorders_workload.result.metadata.name }}" + namespace: "{{ recorders_workload.result.metadata.namespace }}" register: recorders_statefulset_info until: - recorders_statefulset_info.resources | length > 0 @@ -87,3 +88,5 @@ - recorders_statefulset_info.resources[0].status.readyReplicas == 1 retries: 8 delay: 15 + when: + - recorders_workload is defined diff --git a/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml b/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml index 1b9b7376..5c52452f 100644 --- a/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml +++ b/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml @@ -28,6 +28,7 @@ namespace: "{{ recorders_namespace.name }}" template: templates/topology/kubernetes/statefulset.j2 state: present + register: recorders_workload vars: container_image: "{{ recorders_statefulset.spec.template.spec.containers[0].image }}" container_environment_variables: @@ -36,11 +37,11 @@ - name: Wait for workload to update kubernetes.core.k8s_info: - api_version: apps/v1 - kind: StatefulSet + api_version: "{{ recorders_workload.result.api_version }}" + kind: "{{ recorders_workload.result.kind }}" kubeconfig: "{{ recorders_cluster.kubeconfig }}" - name: kubernetes-topology-recorder - namespace: "{{ recorders_namespace.name }}" + name: "{{ recorders_workload.result.metadata.name }}" + namespace: "{{ recorders_workload.result.metadata.namespace }}" register: recorders_statefulset_info until: - recorders_statefulset_info.resources | length > 0 @@ -49,3 +50,5 @@ - recorders_statefulset_info.resources[0].status.readyReplicas == 1 retries: 8 delay: 15 + when: + - recorders_workload is defined diff --git a/sre/roles/recorders/tasks/install_traces_recorders_jaeger.yaml b/sre/roles/recorders/tasks/install_traces_recorders_jaeger.yaml index 46ef840e..76b30619 100644 --- a/sre/roles/recorders/tasks/install_traces_recorders_jaeger.yaml +++ b/sre/roles/recorders/tasks/install_traces_recorders_jaeger.yaml @@ -28,6 +28,7 @@ namespace: "{{ recorders_namespace.name }}" template: templates/traces/jaeger/statefulset.j2 state: present + register: recorders_workload vars: container_image: "{{ recorders_statefulset.spec.template.spec.containers[0].image }}" container_environment_variables: @@ -36,11 +37,11 @@ - name: Wait for workload to update kubernetes.core.k8s_info: - api_version: apps/v1 - kind: StatefulSet + api_version: "{{ recorders_workload.result.api_version }}" + kind: "{{ recorders_workload.result.kind }}" kubeconfig: "{{ recorders_cluster.kubeconfig }}" - name: jaeger-traces-recorder - namespace: "{{ recorders_namespace.name }}" + name: "{{ recorders_workload.result.metadata.name }}" + namespace: "{{ recorders_workload.result.metadata.namespace }}" register: recorders_statefulset_info until: - recorders_statefulset_info.resources | length > 0 @@ -49,3 +50,5 @@ - 
recorders_statefulset_info.resources[0].status.readyReplicas == 1 retries: 8 delay: 15 + when: + - recorders_workload is defined From f1291a4f67804fd29873fd7158abdf49b35e70f7 Mon Sep 17 00:00:00 2001 From: Gerard Vanloo Date: Wed, 20 Aug 2025 10:48:18 -0400 Subject: [PATCH 24/35] fix: correct prometheus test file Signed-off-by: Gerard Vanloo --- sre/tests/files/prometheus.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sre/tests/files/prometheus.yaml b/sre/tests/files/prometheus.yaml index 74e263f6..9135eb6a 100644 --- a/sre/tests/files/prometheus.yaml +++ b/sre/tests/files/prometheus.yaml @@ -1,11 +1,11 @@ tools: - chaos_mesh: true + chaos_mesh: false clickhouse: false - ingress: false + ingress: true jaeger: false kubernetes_topology_monitor: false kubernetes_metrics_server: false opencost: false opensearch: false opentelemetry: false - prometheus: false + prometheus: true From 674d956724929986cada5081e549a5bafe41bbb5 Mon Sep 17 00:00:00 2001 From: Gerard Vanloo Date: Wed, 20 Aug 2025 11:07:40 -0400 Subject: [PATCH 25/35] fix: correct typing of templated variables Signed-off-by: Gerard Vanloo --- sre/roles/recorders/templates/alerts/prometheus/configmap.j2 | 4 ++-- sre/roles/recorders/templates/alerts/prometheus/secret.j2 | 2 +- .../recorders/templates/alerts/prometheus/statefulset.j2 | 2 +- .../recorders/templates/topology/kubernetes/configmap.j2 | 4 ++-- .../recorders/templates/topology/kubernetes/statefulset.j2 | 2 +- sre/roles/recorders/templates/traces/jaeger/configmap.j2 | 4 ++-- sre/roles/recorders/templates/traces/jaeger/statefulset.j2 | 2 +- 7 files changed, 10 insertions(+), 10 deletions(-) diff --git a/sre/roles/recorders/templates/alerts/prometheus/configmap.j2 b/sre/roles/recorders/templates/alerts/prometheus/configmap.j2 index 1b9a3c6b..e8322148 100644 --- a/sre/roles/recorders/templates/alerts/prometheus/configmap.j2 +++ b/sre/roles/recorders/templates/alerts/prometheus/configmap.j2 @@ -7,5 +7,5 @@ metadata: app.kubernetes.io/part-of: it-bench name: alerts-recorder-prometheus-scripts data: - deps: "{{ requirements_file_contents }}" - script: "{{ python_script_file_contents }}" + deps: {{ requirements_file_contents }} + script: {{ python_script_file_contents }} diff --git a/sre/roles/recorders/templates/alerts/prometheus/secret.j2 b/sre/roles/recorders/templates/alerts/prometheus/secret.j2 index f731d990..bea55c18 100644 --- a/sre/roles/recorders/templates/alerts/prometheus/secret.j2 +++ b/sre/roles/recorders/templates/alerts/prometheus/secret.j2 @@ -7,4 +7,4 @@ metadata: app.kubernetes.io/part-of: it-bench name: alerts-recorder-prometheus-token data: - token: "{{ prometheus_bearer_token }}" + token: {{ prometheus_bearer_token }} diff --git a/sre/roles/recorders/templates/alerts/prometheus/statefulset.j2 b/sre/roles/recorders/templates/alerts/prometheus/statefulset.j2 index 25e40d89..7906d48d 100644 --- a/sre/roles/recorders/templates/alerts/prometheus/statefulset.j2 +++ b/sre/roles/recorders/templates/alerts/prometheus/statefulset.j2 @@ -21,7 +21,7 @@ spec: spec: containers: - name: recorder - image: "{{ container_image }}" + image: {{ container_image }} command: - /bin/sh args: diff --git a/sre/roles/recorders/templates/topology/kubernetes/configmap.j2 b/sre/roles/recorders/templates/topology/kubernetes/configmap.j2 index 4842e641..c4b88f4e 100644 --- a/sre/roles/recorders/templates/topology/kubernetes/configmap.j2 +++ b/sre/roles/recorders/templates/topology/kubernetes/configmap.j2 @@ -7,5 +7,5 @@ metadata: app.kubernetes.io/part-of: it-bench 
name: topology-recorder-kubernetes-scripts data: - deps: "{{ requirements_file_contents }}" - script: "{{ python_script_file_contents }}" + deps: {{ requirements_file_contents }} + script: {{ python_script_file_contents }} diff --git a/sre/roles/recorders/templates/topology/kubernetes/statefulset.j2 b/sre/roles/recorders/templates/topology/kubernetes/statefulset.j2 index 9e880351..72455363 100644 --- a/sre/roles/recorders/templates/topology/kubernetes/statefulset.j2 +++ b/sre/roles/recorders/templates/topology/kubernetes/statefulset.j2 @@ -21,7 +21,7 @@ spec: spec: containers: - name: recorder - image: "{{ container_image }}" + image: {{ container_image }} command: - /bin/sh args: diff --git a/sre/roles/recorders/templates/traces/jaeger/configmap.j2 b/sre/roles/recorders/templates/traces/jaeger/configmap.j2 index 01240c14..e3a79f86 100644 --- a/sre/roles/recorders/templates/traces/jaeger/configmap.j2 +++ b/sre/roles/recorders/templates/traces/jaeger/configmap.j2 @@ -7,5 +7,5 @@ metadata: app.kubernetes.io/part-of: it-bench name: traces-recorder-jaeger-scripts data: - deps: "{{ requirements_file_contents }}" - script: "{{ python_script_file_contents }}" + deps: {{ requirements_file_contents }} + script: {{ python_script_file_contents }} diff --git a/sre/roles/recorders/templates/traces/jaeger/statefulset.j2 b/sre/roles/recorders/templates/traces/jaeger/statefulset.j2 index 8e919788..190192c5 100644 --- a/sre/roles/recorders/templates/traces/jaeger/statefulset.j2 +++ b/sre/roles/recorders/templates/traces/jaeger/statefulset.j2 @@ -21,7 +21,7 @@ spec: spec: containers: - name: recorder - image: "{{ container_image }}" + image: {{ container_image }} command: - /bin/sh args: From bbb2a2a2894311ee16c5b4c5995fdcde326094a7 Mon Sep 17 00:00:00 2001 From: Gerard Vanloo Date: Wed, 20 Aug 2025 11:51:06 -0400 Subject: [PATCH 26/35] fix: correct confimap spacing Signed-off-by: Gerard Vanloo --- .../recorders/templates/alerts/prometheus/configmap.j2 | 6 ++++-- .../recorders/templates/topology/kubernetes/configmap.j2 | 6 ++++-- sre/roles/recorders/templates/traces/jaeger/configmap.j2 | 6 ++++-- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/sre/roles/recorders/templates/alerts/prometheus/configmap.j2 b/sre/roles/recorders/templates/alerts/prometheus/configmap.j2 index e8322148..045e7ffe 100644 --- a/sre/roles/recorders/templates/alerts/prometheus/configmap.j2 +++ b/sre/roles/recorders/templates/alerts/prometheus/configmap.j2 @@ -7,5 +7,7 @@ metadata: app.kubernetes.io/part-of: it-bench name: alerts-recorder-prometheus-scripts data: - deps: {{ requirements_file_contents }} - script: {{ python_script_file_contents }} + deps: | + {{ requirements_file_contents }} + script: | + {{ python_script_file_contents }} diff --git a/sre/roles/recorders/templates/topology/kubernetes/configmap.j2 b/sre/roles/recorders/templates/topology/kubernetes/configmap.j2 index c4b88f4e..40e03321 100644 --- a/sre/roles/recorders/templates/topology/kubernetes/configmap.j2 +++ b/sre/roles/recorders/templates/topology/kubernetes/configmap.j2 @@ -7,5 +7,7 @@ metadata: app.kubernetes.io/part-of: it-bench name: topology-recorder-kubernetes-scripts data: - deps: {{ requirements_file_contents }} - script: {{ python_script_file_contents }} + deps: | + {{ requirements_file_contents }} + script: | + {{ python_script_file_contents }} diff --git a/sre/roles/recorders/templates/traces/jaeger/configmap.j2 b/sre/roles/recorders/templates/traces/jaeger/configmap.j2 index e3a79f86..0a316291 100644 --- 
a/sre/roles/recorders/templates/traces/jaeger/configmap.j2 +++ b/sre/roles/recorders/templates/traces/jaeger/configmap.j2 @@ -7,5 +7,7 @@ metadata: app.kubernetes.io/part-of: it-bench name: traces-recorder-jaeger-scripts data: - deps: {{ requirements_file_contents }} - script: {{ python_script_file_contents }} + deps: | + {{ requirements_file_contents }} + script: | + {{ python_script_file_contents }} From 05f4022279ddf6788c3bf93093f01f87835156fb Mon Sep 17 00:00:00 2001 From: Gerard Vanloo Date: Wed, 20 Aug 2025 13:34:46 -0400 Subject: [PATCH 27/35] feat: add export option to s3 bucket Signed-off-by: Gerard Vanloo --- sre/group_vars/all/storage.yaml.example | 5 +++-- sre/playbooks/manage_recorders.yaml | 1 + sre/roles/awx/meta/argument_specs.yaml | 7 +++++-- sre/roles/awx/tasks/launch_workflows.yaml | 8 ++++---- sre/roles/recorders/meta/argument_specs.yaml | 13 +++++++++++++ .../uninstall_alerts_recorders_prometheus.yaml | 14 ++++++++++++++ .../uninstall_topology_recorders_kubernetes.yaml | 14 ++++++++++++++ .../tasks/uninstall_traces_recorders_jaeger.yaml | 14 ++++++++++++++ 8 files changed, 68 insertions(+), 8 deletions(-) diff --git a/sre/group_vars/all/storage.yaml.example b/sre/group_vars/all/storage.yaml.example index ea4c8b9b..c16ef45a 100644 --- a/sre/group_vars/all/storage.yaml.example +++ b/sre/group_vars/all/storage.yaml.example @@ -2,5 +2,6 @@ storage: {} # local: # directory: "" # s3: - # bucket_name: "" - # endpoint_url: "" + # bucket: "" + # directory: "" + # endpoint: "" diff --git a/sre/playbooks/manage_recorders.yaml b/sre/playbooks/manage_recorders.yaml index 17afceb3..12197b34 100644 --- a/sre/playbooks/manage_recorders.yaml +++ b/sre/playbooks/manage_recorders.yaml @@ -50,3 +50,4 @@ jaeger: "{{ incidents_tools.jaeger | default(tools.jaeger) }}" recorders_storage: local: "{{ storage.local | default(omit) }}" + s3: "{{ storage.s3 | default(omit) }}" diff --git a/sre/roles/awx/meta/argument_specs.yaml b/sre/roles/awx/meta/argument_specs.yaml index 9c8b2e9f..55971d19 100644 --- a/sre/roles/awx/meta/argument_specs.yaml +++ b/sre/roles/awx/meta/argument_specs.yaml @@ -60,10 +60,13 @@ argument_specs: required: false type: dict options: - bucket_name: + bucket: required: true type: str - endpoint_url: + directory: + required: true + type: str + endpoint: required: true type: str trials: diff --git a/sre/roles/awx/tasks/launch_workflows.yaml b/sre/roles/awx/tasks/launch_workflows.yaml index 5fa8c651..1b41f55b 100644 --- a/sre/roles/awx/tasks/launch_workflows.yaml +++ b/sre/roles/awx/tasks/launch_workflows.yaml @@ -37,8 +37,8 @@ scenario_number: "{{ item[0] }}" run_number: "{{ item[1] }}" sre_agent_name__version_number: "{{ awx_agent.version }}" - s3_bucket_name_for_results: "{{ awx_experiments.storage.s3.bucket_name | default('') }}" - s3_endpoint_url: "{{ awx_experiments.storage.s3.endpoint_url | default('') }}" + s3_bucket_name_for_results: "{{ awx_experiments.storage.s3.bucket | default('') }}" + s3_endpoint_url: "{{ awx_experiments.storage.s3.endpoint | default('') }}" sre_bench_runner: "{{ sre_bench_runner | default(true) }}" wait: false loop: "{{ awx_experiment_workflows }}" @@ -56,8 +56,8 @@ scenario_number: "{{ item[0] }}" run_number: "{{ item[1] }}" sre_agent_name__version_number: "{{ awx_agent.version }}" - s3_bucket_name_for_results: "{{ awx_experiments.storage.s3.bucket_name | default('') }}" - s3_endpoint_url: "{{ awx_experiments.storage.s3.endpoint_url | default('') }}" + s3_bucket_name_for_results: "{{ awx_experiments.storage.s3.bucket | default('') }}" + 
s3_endpoint_url: "{{ awx_experiments.storage.s3.endpoint | default('') }}" sre_bench_runner: "{{ sre_bench_runner | default(true) }}" wait: false loop: "{{ awx_experiment_workflows }}" diff --git a/sre/roles/recorders/meta/argument_specs.yaml b/sre/roles/recorders/meta/argument_specs.yaml index 8a826880..865964c9 100644 --- a/sre/roles/recorders/meta/argument_specs.yaml +++ b/sre/roles/recorders/meta/argument_specs.yaml @@ -62,3 +62,16 @@ argument_specs: directory: required: true type: str + s3: + required: false + type: dict + options: + bucket: + required: true + type: str + directory: + required: true + type: str + endpoint: + required: true + type: str diff --git a/sre/roles/recorders/tasks/uninstall_alerts_recorders_prometheus.yaml b/sre/roles/recorders/tasks/uninstall_alerts_recorders_prometheus.yaml index 6a4fcbe9..5b306bd7 100644 --- a/sre/roles/recorders/tasks/uninstall_alerts_recorders_prometheus.yaml +++ b/sre/roles/recorders/tasks/uninstall_alerts_recorders_prometheus.yaml @@ -76,3 +76,17 @@ loop_var: file when: - recorders_storage.local is defined + +- name: Upload exported data to S3 bucket + amazon.aws.s3_object: + endpoint_url: "{{ recorders_storage.s3.endpoint }}" + bucket: "{{ recorders_storage.s3.bucket }}" + object: "/{{ recorders_storage.s3.directory }}/{{ file.path | basename }}" + src: "{{ file.path }}" + mode: put + loop: "{{ recorders_files.files }}" + loop_control: + label: file/{{ file.path | basename }} + loop_var: file + when: + - recorders_storage.s3 is defined diff --git a/sre/roles/recorders/tasks/uninstall_topology_recorders_kubernetes.yaml b/sre/roles/recorders/tasks/uninstall_topology_recorders_kubernetes.yaml index 59cfa136..6818bcc4 100644 --- a/sre/roles/recorders/tasks/uninstall_topology_recorders_kubernetes.yaml +++ b/sre/roles/recorders/tasks/uninstall_topology_recorders_kubernetes.yaml @@ -61,3 +61,17 @@ loop_var: file when: - recorders_storage.local is defined + +- name: Upload exported data to S3 bucket + amazon.aws.s3_object: + endpoint_url: "{{ recorders_storage.s3.endpoint }}" + bucket: "{{ recorders_storage.s3.bucket }}" + object: "/{{ recorders_storage.s3.directory }}/{{ file.path | basename }}" + src: "{{ file.path }}" + mode: put + loop: "{{ recorders_files.files }}" + loop_control: + label: file/{{ file.path | basename }} + loop_var: file + when: + - recorders_storage.s3 is defined diff --git a/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml b/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml index ee7a5e27..4907ff2e 100644 --- a/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml +++ b/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml @@ -61,3 +61,17 @@ loop_var: file when: - recorders_storage.local is defined + +- name: Upload exported data to S3 bucket + amazon.aws.s3_object: + endpoint_url: "{{ recorders_storage.s3.endpoint }}" + bucket: "{{ recorders_storage.s3.bucket }}" + object: "/{{ recorders_storage.s3.directory }}/{{ file.path | basename }}" + src: "{{ file.path }}" + mode: put + loop: "{{ recorders_files.files }}" + loop_control: + label: file/{{ file.path | basename }} + loop_var: file + when: + - recorders_storage.s3 is defined From 9492f69b37b11acd5894f7b0e6175b0e0133c0ca Mon Sep 17 00:00:00 2001 From: Gerard Vanloo Date: Wed, 20 Aug 2025 13:54:39 -0400 Subject: [PATCH 28/35] fix: correct indents for configmap Signed-off-by: Gerard Vanloo --- .../recorders/tasks/install_alerts_recorders_prometheus.yaml | 2 +- .../tasks/uninstall_alerts_recorders_prometheus.yaml | 2 
++ .../tasks/uninstall_topology_recorders_kubernetes.yaml | 2 ++ .../recorders/tasks/uninstall_traces_recorders_jaeger.yaml | 2 ++ sre/roles/recorders/templates/alerts/prometheus/configmap.j2 | 4 ++-- .../recorders/templates/topology/kubernetes/configmap.j2 | 4 ++-- sre/roles/recorders/templates/traces/jaeger/configmap.j2 | 4 ++-- 7 files changed, 13 insertions(+), 7 deletions(-) diff --git a/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml b/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml index 08f44ffc..22da8256 100644 --- a/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml +++ b/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml @@ -26,7 +26,7 @@ {{ recorders_prometheus_env_vars + [{ - 'name': PROMETHEUS_TOKEN, + 'name': 'PROMETHEUS_TOKEN', 'valueFrom': { 'secretKeyRef': { 'name': 'alerts-recorder-prometheus-token', diff --git a/sre/roles/recorders/tasks/uninstall_alerts_recorders_prometheus.yaml b/sre/roles/recorders/tasks/uninstall_alerts_recorders_prometheus.yaml index 5b306bd7..359c8c00 100644 --- a/sre/roles/recorders/tasks/uninstall_alerts_recorders_prometheus.yaml +++ b/sre/roles/recorders/tasks/uninstall_alerts_recorders_prometheus.yaml @@ -76,6 +76,7 @@ loop_var: file when: - recorders_storage.local is defined + - recorders_files is defined - name: Upload exported data to S3 bucket amazon.aws.s3_object: @@ -90,3 +91,4 @@ loop_var: file when: - recorders_storage.s3 is defined + - recorders_files is defined diff --git a/sre/roles/recorders/tasks/uninstall_topology_recorders_kubernetes.yaml b/sre/roles/recorders/tasks/uninstall_topology_recorders_kubernetes.yaml index 6818bcc4..538cf68c 100644 --- a/sre/roles/recorders/tasks/uninstall_topology_recorders_kubernetes.yaml +++ b/sre/roles/recorders/tasks/uninstall_topology_recorders_kubernetes.yaml @@ -61,6 +61,7 @@ loop_var: file when: - recorders_storage.local is defined + - recorders_files is defined - name: Upload exported data to S3 bucket amazon.aws.s3_object: @@ -75,3 +76,4 @@ loop_var: file when: - recorders_storage.s3 is defined + - recorders_files is defined diff --git a/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml b/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml index 4907ff2e..228f6cd4 100644 --- a/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml +++ b/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml @@ -61,6 +61,7 @@ loop_var: file when: - recorders_storage.local is defined + - recorders_files is defined - name: Upload exported data to S3 bucket amazon.aws.s3_object: @@ -75,3 +76,4 @@ loop_var: file when: - recorders_storage.s3 is defined + - recorders_files is defined diff --git a/sre/roles/recorders/templates/alerts/prometheus/configmap.j2 b/sre/roles/recorders/templates/alerts/prometheus/configmap.j2 index 045e7ffe..321dd63f 100644 --- a/sre/roles/recorders/templates/alerts/prometheus/configmap.j2 +++ b/sre/roles/recorders/templates/alerts/prometheus/configmap.j2 @@ -8,6 +8,6 @@ metadata: name: alerts-recorder-prometheus-scripts data: deps: | - {{ requirements_file_contents }} + {{ requirements_file_contents | indent(width=4) }} script: | - {{ python_script_file_contents }} + {{ python_script_file_contents | indent(width=4) }} diff --git a/sre/roles/recorders/templates/topology/kubernetes/configmap.j2 b/sre/roles/recorders/templates/topology/kubernetes/configmap.j2 index 40e03321..06451695 100644 --- a/sre/roles/recorders/templates/topology/kubernetes/configmap.j2 +++ 
b/sre/roles/recorders/templates/topology/kubernetes/configmap.j2 @@ -8,6 +8,6 @@ metadata: name: topology-recorder-kubernetes-scripts data: deps: | - {{ requirements_file_contents }} + {{ requirements_file_contents | indent(width=4) }} script: | - {{ python_script_file_contents }} + {{ python_script_file_contents | indent(width=4) }} diff --git a/sre/roles/recorders/templates/traces/jaeger/configmap.j2 b/sre/roles/recorders/templates/traces/jaeger/configmap.j2 index 0a316291..f2ecef78 100644 --- a/sre/roles/recorders/templates/traces/jaeger/configmap.j2 +++ b/sre/roles/recorders/templates/traces/jaeger/configmap.j2 @@ -8,6 +8,6 @@ metadata: name: traces-recorder-jaeger-scripts data: deps: | - {{ requirements_file_contents }} + {{ requirements_file_contents | indent(width=4) }} script: | - {{ python_script_file_contents }} + {{ python_script_file_contents | indent(width=4) }} From 8b39e238f85ad84b66a0d8b016a05a514252a74f Mon Sep 17 00:00:00 2001 From: Gerard Vanloo Date: Wed, 20 Aug 2025 15:19:17 -0400 Subject: [PATCH 29/35] chore: remove book keeping role in place of alert recording Signed-off-by: Gerard Vanloo --- sre/Makefile | 21 +- .../book_keeping/tasks/leverage_ingress.yaml | 18 -- .../tasks/leverage_port_forwarding.yaml | 34 --- sre/roles/book_keeping/tasks/main.yaml | 205 ------------------ .../install_alerts_recorders_prometheus.yaml | 2 +- ...install_topology_recorders_kubernetes.yaml | 4 +- .../install_traces_recorders_jaeger.yaml | 8 +- 7 files changed, 14 insertions(+), 278 deletions(-) delete mode 100644 sre/roles/book_keeping/tasks/leverage_ingress.yaml delete mode 100644 sre/roles/book_keeping/tasks/leverage_port_forwarding.yaml delete mode 100644 sre/roles/book_keeping/tasks/main.yaml diff --git a/sre/Makefile b/sre/Makefile index 4491bd4e..4b632898 100644 --- a/sre/Makefile +++ b/sre/Makefile @@ -1,7 +1,5 @@ # Makefile to run Ansible playbooks -EXECUTE_CHECKS_IN_BACKGROUND ?= false - NUMBER_OF_RUNS = 1 @@ -143,20 +141,6 @@ injection_docs: ## Generates documentation for all fault injection export OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES; \ ansible-playbook base.yaml --tags "injection_docs" -.PHONY: check_alerts -check_alerts: -ifeq ($(EXECUTE_CHECKS_IN_BACKGROUND),false) - ansible-playbook -v base.yaml --tags "book_keeping" \ - --extra-vars "is_book_keeping=true" \ - --extra-vars "sample_application=$(SAMPLE_APPLICATION)" \ - $(BOOK_KEEPING_EXTRA_VARS) -else - nohup ansible-playbook -v base.yaml --tags "book_keeping" \ - --extra-vars "is_book_keeping=true" \ - --extra-vars "sample_application=$(SAMPLE_APPLICATION)" \ - $(BOOK_KEEPING_EXTRA_VARS) & -endif - .SILENT: fetch_alerts fetch_alerts: ansible-playbook -v base.yaml --tags "fetch_alerts" \ @@ -291,3 +275,8 @@ e2e_awx_stage_three: ## DEPRECATED: Given an incident number, run_uuid end the s @echo "Executing 'make launch_stop_workflow'..." @echo "" $(MAKE) launch_stop_workflow + +.PHONY: check_alerts +check_alerts: + @echo "WARNING: 'make check_alerts' is deprecated. Please use 'make deploy_recorders' instead." + @echo "This command will be removed in a future version." 
diff --git a/sre/roles/book_keeping/tasks/leverage_ingress.yaml b/sre/roles/book_keeping/tasks/leverage_ingress.yaml deleted file mode 100644 index e9b37f21..00000000 --- a/sre/roles/book_keeping/tasks/leverage_ingress.yaml +++ /dev/null @@ -1,18 +0,0 @@ ---- -- name: Get the Ingress URL of Prometheus - ansible.builtin.shell: "KUBECONFIG={{ kubeconfig }} kubectl get ingress prometheus -n {{ prometheus_namespace_project_name }} -o json" - register: prometheus_ingress - retries: 5 - delay: 60 - until: (prometheus_ingress.stdout | length) > 0 - ignore_errors: yes - -- name: Extract the Ingress hostname information - set_fact: - ingress_hostname: "{{ prometheus_ingress.stdout | from_json | json_query('status.loadBalancer.ingress[0].hostname') }}" - when: prometheus_ingress.stdout | trim != '' - -- name: Set the Prometheus URL - set_fact: - prometheus_url: "http://{{ ingress_hostname }}/prometheus" - when: ingress_hostname is defined and ingress_hostname | trim != '' diff --git a/sre/roles/book_keeping/tasks/leverage_port_forwarding.yaml b/sre/roles/book_keeping/tasks/leverage_port_forwarding.yaml deleted file mode 100644 index c834cccb..00000000 --- a/sre/roles/book_keeping/tasks/leverage_port_forwarding.yaml +++ /dev/null @@ -1,34 +0,0 @@ ---- -- name: Check availability of ports - ansible.builtin.shell: | - lsof -i :{{ item }} > /dev/null && echo "in_use" || echo "available" - register: lsof_check - loop: "{{ range(32100, 32125) | list }}" - changed_when: false - failed_when: false - loop_control: - loop_var: item - when: ingress_hostname is undefined or ingress_hostname | trim == '' - -- name: Assign a dynamic port if one is available - set_fact: - dynamic_port: "{{ (lsof_check.results | selectattr('stdout', 'equalto', 'available') | map(attribute='item') | list | first) }}" - when: ingress_hostname is undefined or ingress_hostname | trim == '' - -- name: Kubectl port-forward on/for the prometheus-server service with dynamic port - ansible.builtin.shell: KUBECONFIG={{ kubeconfig }} kubectl -n "{{ prometheus_namespace_project_name }}" port-forward "svc/prometheus-server" "{{ dynamic_port }}:80" --request-timeout=10m - async: 600 - poll: 0 - register: prometheus_port_forward_for_datasources_creation - when: ingress_hostname is undefined or ingress_hostname | trim == '' - -- name: Wait for port-forward to be available - ansible.builtin.wait_for_connection: - delay: 5 - timeout: 30 - when: ingress_hostname is undefined or ingress_hostname | trim == '' - -- name: Set the Prometheus URL - set_fact: - prometheus_url: "http://127.0.0.1:{{ dynamic_port }}" - when: ingress_hostname is undefined or ingress_hostname | trim == '' diff --git a/sre/roles/book_keeping/tasks/main.yaml b/sre/roles/book_keeping/tasks/main.yaml deleted file mode 100644 index 61d1189d..00000000 --- a/sre/roles/book_keeping/tasks/main.yaml +++ /dev/null @@ -1,205 +0,0 @@ ---- -- name: Print prometheus namespace and application namespace - debug: - msg: "Prometheus namespace : {{ prometheus_namespace_project_name }}, Application namespace : {{ otel_astronomy_app_namespace_project_name }}" - tags: - - book_keeping - -- name: Tasks associated with leveraging ingress - ansible.builtin.include_tasks: - file: leverage_ingress.yaml - apply: - tags: - - book_keeping - tags: - - book_keeping - -- name: Tasks associated with leveraging port forwarding - ansible.builtin.include_tasks: - file: leverage_port_forwarding.yaml - apply: - tags: - - book_keeping - tags: - - book_keeping - -- name: Initialize an empty list for selected user 
details - set_fact: - filtered_alerts: [] - when: - - is_book_keeping - tags: - - book_keeping - -- name: Call the alerts API - uri: - url: "{{ prometheus_url }}/api/v1/alerts" - method: GET - return_content: yes - body_format: json - headers: - Content-Type: "application/json" - register: api_response - until: "{{ api_response.json.data.alerts | selectattr('state', '==', 'firing') | list | length > 0 }}" - retries: 120 - delay: 10 - tags: - - book_keeping - -- name: Debug - All alerts - debug: - var: api_response.json.data.alerts - tags: - - book_keeping - -- name: Parse JSON response and filter for alerts in state firing/alerting - set_fact: - json_data: "{{ api_response.json.data.alerts | selectattr('state', '==', 'firing') | list }}" - tags: - - book_keeping - -- name: Create temporary file - ansible.builtin.tempfile: - state: file - suffix: temp - register: tempfile_for_alerts_in_firing_state - tags: - - book_keeping - -- name: Debug - Alerts in firing / alerting state - debug: - var: json_data - tags: - - book_keeping - -- name: Copy alerts in JSON to temp file - ansible.builtin.copy: - content: "{{ json_data | to_json }}" - dest: "{{ tempfile_for_alerts_in_firing_state.path }}" - tags: - - book_keeping - -- name: Upload Alerts JSON to S3 - amazon.aws.s3_object: - endpoint_url: "{{ s3_endpoint_url }}" - bucket: "{{ s3_bucket_name_for_results }}" - object: "/{{ sre_agent_name__version_number }}/{{run_uuid}}/{{scenario_number}}/{{run_number}}/alerts_in_alerting_state_{{now(utc=true,fmt='%Y-%m-%dT%H:%M:%S.%f')}}.txt" - src: "{{ tempfile_for_alerts_in_firing_state.path }}" - mode: put - tags: - - book_keeping - when: run_uuid is defined and scenario_number is defined and run_number is defined - -- name: Parse alerts to config - set_fact: - filtered_alerts: "{{ filtered_alerts + [{'service_name':item.labels.service_name if item.labels.service_name is defined else 'No service name', 'alert_name':item.labels.alertname, 'alert_active_time':item.activeAt, 'alert_status':item.state}] }}" - with_items: "{{ json_data }}" - when: - - is_book_keeping - tags: - - book_keeping - -- name: Retrieve existing bundle-alert-timestamps ConfigMap - kubernetes.core.k8s_info: - kubeconfig: "{{ kubeconfig }}" - namespace: "{{ otel_astronomy_app_namespace_project_name }}" - kind: ConfigMap - name: bundle-alert-timestamps - register: existing_configmap_info - when: - - is_book_keeping - - sample_application == "otel_astronomy_shop" - tags: - - book_keeping - -- name: Record alert manifestation time in ConfigMap - vars: - configmap_data: - current_alerts : "{{ filtered_alerts }}" - kubernetes.core.k8s: - kubeconfig: "{{ kubeconfig }}" - definition: - apiVersion: v1 - kind: ConfigMap - metadata: - name: bundle-alert-timestamps - namespace: "{{ otel_astronomy_app_namespace_project_name }}" - data: - fault_injection_time: "{{ existing_configmap_info.resources[0].data.fault_injection_time }}" - deployment_time: "{{ existing_configmap_info.resources[0].data.deployment_time }}" - data.json: "{{ configmap_data | to_nice_json }}" - when: - - is_book_keeping - - sample_application == "otel_astronomy_shop" - tags: - - book_keeping - -- name: Retrieve existing bundle-alert-timestamps ConfigMap - kubernetes.core.k8s_info: - kubeconfig: "{{ kubeconfig }}" - namespace: "{{ deathstarbench_hotelreservation_app_namespace_project_name }}" - kind: ConfigMap - name: bundle-alert-timestamps - register: existing_configmap_info - when: - - is_book_keeping - - sample_application == "dsb_hotel_reservation" - tags: - - book_keeping - -- 
name: Record alert manifestation time in ConfigMap - vars: - configmap_data: - current_alerts : "{{ filtered_alerts }}" - kubernetes.core.k8s: - kubeconfig: "{{ kubeconfig }}" - definition: - apiVersion: v1 - kind: ConfigMap - metadata: - name: bundle-alert-timestamps - namespace: "{{ deathstarbench_hotelreservation_app_namespace_project_name }}" - data: - fault_injection_time: "{{ existing_configmap_info.resources[0].data.fault_injection_time }}" - deployment_time: "{{ existing_configmap_info.resources[0].data.deployment_time }}" - data.json: "{{ configmap_data | to_nice_json }}" - when: - - is_book_keeping - - sample_application == "dsb_hotel_reservation" - tags: - - book_keeping - -- name: Retrieve existing bundle-alert-timestamps ConfigMap - kubernetes.core.k8s_info: - kubeconfig: "{{ kubeconfig }}" - namespace: "{{ elasticsearch_app_namespace_project_name }}" - kind: ConfigMap - name: bundle-alert-timestamps - register: existing_configmap_info - when: - - is_book_keeping - - sample_application == "elasticsearch" - tags: - - book_keeping - -- name: Record alert manifestation time in ConfigMap - vars: - configmap_data: - current_alerts : "{{ filtered_alerts }}" - kubernetes.core.k8s: - kubeconfig: "{{ kubeconfig }}" - definition: - apiVersion: v1 - kind: ConfigMap - metadata: - name: bundle-alert-timestamps - namespace: "{{ elasticsearch_app_namespace_project_name }}" - data: - fault_injection_time: "{{ existing_configmap_info.resources[0].data.fault_injection_time }}" - deployment_time: "{{ existing_configmap_info.resources[0].data.deployment_time }}" - data.json: "{{ configmap_data | to_nice_json }}" - when: - - is_book_keeping - - sample_application == "elasticsearch" - tags: - - book_keeping diff --git a/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml b/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml index 22da8256..eef0f777 100644 --- a/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml +++ b/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml @@ -75,7 +75,7 @@ - name: Wait for workload to update kubernetes.core.k8s_info: - api_version: "{{ recorders_workload.result.api_version }}" + api_version: "{{ recorders_workload.result.apiVersion }}" kind: "{{ recorders_workload.result.kind }}" kubeconfig: "{{ recorders_cluster.kubeconfig }}" name: "{{ recorders_workload.result.metadata.name }}" diff --git a/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml b/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml index 5c52452f..43cb8bdf 100644 --- a/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml +++ b/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml @@ -34,10 +34,12 @@ container_environment_variables: - name: KUBERNETES_TOPOLOGY_MONITOR_ENDPOINT value: "{{ tools_kubernetes_topology_mapper_endpoint }}" + when: + - tools_kubernetes_topology_mapper_endpoint is defined - name: Wait for workload to update kubernetes.core.k8s_info: - api_version: "{{ recorders_workload.result.api_version }}" + api_version: "{{ recorders_workload.result.apiVersion }}" kind: "{{ recorders_workload.result.kind }}" kubeconfig: "{{ recorders_cluster.kubeconfig }}" name: "{{ recorders_workload.result.metadata.name }}" diff --git a/sre/roles/recorders/tasks/install_traces_recorders_jaeger.yaml b/sre/roles/recorders/tasks/install_traces_recorders_jaeger.yaml index 76b30619..75bacf0b 100644 --- a/sre/roles/recorders/tasks/install_traces_recorders_jaeger.yaml +++ 
b/sre/roles/recorders/tasks/install_traces_recorders_jaeger.yaml @@ -32,12 +32,14 @@ vars: container_image: "{{ recorders_statefulset.spec.template.spec.containers[0].image }}" container_environment_variables: - - name: KUBERNETES_TOPOLOGY_MONITOR_ENDPOINT - value: "{{ tools_kubernetes_topology_mapper_endpoint }}" + - name: JAEGER_ENDPOINT + value: "{{ tools_jaeger_querier_endpoint }}" + when: + - tools_jaeger_querier_endpoint is defined - name: Wait for workload to update kubernetes.core.k8s_info: - api_version: "{{ recorders_workload.result.api_version }}" + api_version: "{{ recorders_workload.result.apiVersion }}" kind: "{{ recorders_workload.result.kind }}" kubeconfig: "{{ recorders_cluster.kubeconfig }}" name: "{{ recorders_workload.result.metadata.name }}" From 1b1f4254aa045648e92e16bba405ac79edfd700a Mon Sep 17 00:00:00 2001 From: Gerard Vanloo Date: Wed, 20 Aug 2025 20:38:57 -0400 Subject: [PATCH 30/35] fix: correct storage variable assignment Signed-off-by: Gerard Vanloo --- sre/group_vars/book_keeping.yaml | 2 -- sre/playbooks/manage_recorders.yaml | 4 +-- .../install_alerts_recorders_prometheus.yaml | 28 ++++++++----------- ...install_topology_recorders_kubernetes.yaml | 28 ++++++++----------- .../install_traces_recorders_jaeger.yaml | 28 ++++++++----------- ...uninstall_alerts_recorders_prometheus.yaml | 17 +++++------ ...install_topology_recorders_kubernetes.yaml | 18 ++++++------ .../uninstall_traces_recorders_jaeger.yaml | 19 ++++++------- 8 files changed, 61 insertions(+), 83 deletions(-) delete mode 100644 sre/group_vars/book_keeping.yaml diff --git a/sre/group_vars/book_keeping.yaml b/sre/group_vars/book_keeping.yaml deleted file mode 100644 index e2f7574a..00000000 --- a/sre/group_vars/book_keeping.yaml +++ /dev/null @@ -1,2 +0,0 @@ ---- -is_book_keeping: false \ No newline at end of file diff --git a/sre/playbooks/manage_recorders.yaml b/sre/playbooks/manage_recorders.yaml index 12197b34..4b14cf96 100644 --- a/sre/playbooks/manage_recorders.yaml +++ b/sre/playbooks/manage_recorders.yaml @@ -48,6 +48,4 @@ kubernetes: "{{ incidents_tools.kubernetes_topology_monitor | default(tools.kubernetes_topology_monitor) }}" traces: jaeger: "{{ incidents_tools.jaeger | default(tools.jaeger) }}" - recorders_storage: - local: "{{ storage.local | default(omit) }}" - s3: "{{ storage.s3 | default(omit) }}" + recorders_storage: "{{ storage }}" diff --git a/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml b/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml index eef0f777..4858f4cc 100644 --- a/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml +++ b/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml @@ -10,15 +10,13 @@ - name: Load statefulset information ansible.builtin.set_fact: - recorders_statefulset: "{{ lookup('ansible.builtin.file', 'files/topology/kubernetes/statefulset.yaml') | from_yaml }}" + recorders_prometheus_statefulset: "{{ lookup('ansible.builtin.file', 'files/topology/kubernetes/statefulset.yaml') | from_yaml }}" - name: Create Prometheus Alert Recorder environment list ansible.builtin.set_fact: recorders_prometheus_env_vars: - name: PROMETHEUS_ENDPOINT value: "{{ tools_prometheus_endpoint }}" - when: - - tools_prometheus_endpoint is defined - name: Add Secret to environment list ansible.builtin.set_fact: @@ -68,25 +66,23 @@ namespace: "{{ recorders_namespace.name }}" template: templates/alerts/prometheus/statefulset.j2 state: present - register: recorders_workload + register: recorders_prometheus_workload 
vars: - container_image: "{{ recorders_statefulset.spec.template.spec.containers[0].image }}" + container_image: "{{ recorders_prometheus_statefulset.spec.template.spec.containers[0].image }}" container_environment_variables: "{{ recorders_prometheus_env_vars }}" - name: Wait for workload to update kubernetes.core.k8s_info: - api_version: "{{ recorders_workload.result.apiVersion }}" - kind: "{{ recorders_workload.result.kind }}" + api_version: "{{ recorders_prometheus_workload.result.apiVersion }}" + kind: "{{ recorders_prometheus_workload.result.kind }}" kubeconfig: "{{ recorders_cluster.kubeconfig }}" - name: "{{ recorders_workload.result.metadata.name }}" - namespace: "{{ recorders_workload.result.metadata.namespace }}" - register: recorders_statefulset_info + name: "{{ recorders_prometheus_workload.result.metadata.name }}" + namespace: "{{ recorders_prometheus_workload.result.metadata.namespace }}" + register: recorders_prometheus_statefulset_info until: - - recorders_statefulset_info.resources | length > 0 - - recorders_statefulset_info.resources[0].status is defined - - recorders_statefulset_info.resources[0].status.readyReplicas is defined - - recorders_statefulset_info.resources[0].status.readyReplicas == 1 + - recorders_prometheus_statefulset_info.resources | length > 0 + - recorders_prometheus_statefulset_info.resources[0].status is defined + - recorders_prometheus_statefulset_info.resources[0].status.readyReplicas is defined + - recorders_prometheus_statefulset_info.resources[0].status.readyReplicas == 1 retries: 8 delay: 15 - when: - - recorders_workload is defined diff --git a/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml b/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml index 43cb8bdf..c20cdc34 100644 --- a/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml +++ b/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml @@ -10,7 +10,7 @@ - name: Load statefulset information ansible.builtin.set_fact: - recorders_statefulset: "{{ lookup('ansible.builtin.file', 'files/topology/kubernetes/statefulset.yaml') | from_yaml }}" + recorders_kubernetes_statefulset: "{{ lookup('ansible.builtin.file', 'files/topology/kubernetes/statefulset.yaml') | from_yaml }}" - name: Create ConfigMap with Python script kubernetes.core.k8s: @@ -28,29 +28,25 @@ namespace: "{{ recorders_namespace.name }}" template: templates/topology/kubernetes/statefulset.j2 state: present - register: recorders_workload + register: recorders_kubernetes_workload vars: - container_image: "{{ recorders_statefulset.spec.template.spec.containers[0].image }}" + container_image: "{{ recorders_kubernetes_statefulset.spec.template.spec.containers[0].image }}" container_environment_variables: - name: KUBERNETES_TOPOLOGY_MONITOR_ENDPOINT value: "{{ tools_kubernetes_topology_mapper_endpoint }}" - when: - - tools_kubernetes_topology_mapper_endpoint is defined - name: Wait for workload to update kubernetes.core.k8s_info: - api_version: "{{ recorders_workload.result.apiVersion }}" - kind: "{{ recorders_workload.result.kind }}" + api_version: "{{ recorders_kubernetes_workload.result.apiVersion }}" + kind: "{{ recorders_kubernetes_workload.result.kind }}" kubeconfig: "{{ recorders_cluster.kubeconfig }}" - name: "{{ recorders_workload.result.metadata.name }}" - namespace: "{{ recorders_workload.result.metadata.namespace }}" - register: recorders_statefulset_info + name: "{{ recorders_kubernetes_workload.result.metadata.name }}" + namespace: "{{ 
recorders_kubernetes_workload.result.metadata.namespace }}" + register: recorders_kubernetes_statefulset_info until: - - recorders_statefulset_info.resources | length > 0 - - recorders_statefulset_info.resources[0].status is defined - - recorders_statefulset_info.resources[0].status.readyReplicas is defined - - recorders_statefulset_info.resources[0].status.readyReplicas == 1 + - recorders_kubernetes_statefulset_info.resources | length > 0 + - recorders_kubernetes_statefulset_info.resources[0].status is defined + - recorders_kubernetes_statefulset_info.resources[0].status.readyReplicas is defined + - recorders_kubernetes_statefulset_info.resources[0].status.readyReplicas == 1 retries: 8 delay: 15 - when: - - recorders_workload is defined diff --git a/sre/roles/recorders/tasks/install_traces_recorders_jaeger.yaml b/sre/roles/recorders/tasks/install_traces_recorders_jaeger.yaml index 75bacf0b..11bb02ad 100644 --- a/sre/roles/recorders/tasks/install_traces_recorders_jaeger.yaml +++ b/sre/roles/recorders/tasks/install_traces_recorders_jaeger.yaml @@ -10,7 +10,7 @@ - name: Load statefulset information ansible.builtin.set_fact: - recorders_statefulset: "{{ lookup('ansible.builtin.file', 'files/traces/jaeger/statefulset.yaml') | from_yaml }}" + recorders_jaeger_statefulset: "{{ lookup('ansible.builtin.file', 'files/traces/jaeger/statefulset.yaml') | from_yaml }}" - name: Create ConfigMap with Python script kubernetes.core.k8s: @@ -28,29 +28,25 @@ namespace: "{{ recorders_namespace.name }}" template: templates/traces/jaeger/statefulset.j2 state: present - register: recorders_workload + register: recorders_jaeger_workload vars: - container_image: "{{ recorders_statefulset.spec.template.spec.containers[0].image }}" + container_image: "{{ recorders_jaeger_statefulset.spec.template.spec.containers[0].image }}" container_environment_variables: - name: JAEGER_ENDPOINT value: "{{ tools_jaeger_querier_endpoint }}" - when: - - tools_jaeger_querier_endpoint is defined - name: Wait for workload to update kubernetes.core.k8s_info: - api_version: "{{ recorders_workload.result.apiVersion }}" - kind: "{{ recorders_workload.result.kind }}" + api_version: "{{ recorders_jaeger_workload.result.apiVersion }}" + kind: "{{ recorders_jaeger_workload.result.kind }}" kubeconfig: "{{ recorders_cluster.kubeconfig }}" - name: "{{ recorders_workload.result.metadata.name }}" - namespace: "{{ recorders_workload.result.metadata.namespace }}" - register: recorders_statefulset_info + name: "{{ recorders_jaeger_workload.result.metadata.name }}" + namespace: "{{ recorders_jaeger_workload.result.metadata.namespace }}" + register: recorders_jaeger_statefulset_info until: - - recorders_statefulset_info.resources | length > 0 - - recorders_statefulset_info.resources[0].status is defined - - recorders_statefulset_info.resources[0].status.readyReplicas is defined - - recorders_statefulset_info.resources[0].status.readyReplicas == 1 + - recorders_jaeger_statefulset_info.resources | length > 0 + - recorders_jaeger_statefulset_info.resources[0].status is defined + - recorders_jaeger_statefulset_info.resources[0].status.readyReplicas is defined + - recorders_jaeger_statefulset_info.resources[0].status.readyReplicas == 1 retries: 8 delay: 15 - when: - - recorders_workload is defined diff --git a/sre/roles/recorders/tasks/uninstall_alerts_recorders_prometheus.yaml b/sre/roles/recorders/tasks/uninstall_alerts_recorders_prometheus.yaml index 359c8c00..e7e8dac2 100644 --- a/sre/roles/recorders/tasks/uninstall_alerts_recorders_prometheus.yaml +++ 
b/sre/roles/recorders/tasks/uninstall_alerts_recorders_prometheus.yaml @@ -8,19 +8,18 @@ label_selectors: - app.kubernetes.io/name = prometheus-alert-recorder - app.kubernetes.io/part-of = it-bench - register: recorders_pods_info + register: recorders_prometheus_pods_info - name: Copy records directory from pod kubernetes.core.k8s_cp: kubeconfig: "{{ recorders_cluster.kubeconfig }}" local_path: /tmp/alerts - namespace: "{{ recorders_pods_info.resources[0].metadata.namespace }}" - pod: "{{ recorders_pods_info.resources[0].metadata.name }}" + namespace: "{{ recorders_prometheus_pods_info.resources[0].metadata.namespace }}" + pod: "{{ recorders_prometheus_pods_info.resources[0].metadata.name }}" remote_path: /opt/app-root/src/records state: from_pod when: - - recorders_pods_info is defined - - recorders_pods_info.resources | length == 1 + - recorders_prometheus_pods_info.resources | length == 1 - name: Uninstall Prometheus Alert Recorder kubernetes.core.k8s: @@ -63,20 +62,19 @@ path: /tmp/alerts patterns: - "*.json" - register: recorders_files + register: recorders_prometheus_files - name: Copy exported data into local directory ansible.builtin.copy: dest: "{{ recorders_storage.local.directory }}/{{ file.path | basename }}" mode: "0644" src: "{{ file.path }}" - loop: "{{ recorders_files.files }}" + loop: "{{ recorders_prometheus_files.files }}" loop_control: label: file/{{ file.path | basename }} loop_var: file when: - recorders_storage.local is defined - - recorders_files is defined - name: Upload exported data to S3 bucket amazon.aws.s3_object: @@ -85,10 +83,9 @@ object: "/{{ recorders_storage.s3.directory }}/{{ file.path | basename }}" src: "{{ file.path }}" mode: put - loop: "{{ recorders_files.files }}" + loop: "{{ recorders_prometheus_files.files }}" loop_control: label: file/{{ file.path | basename }} loop_var: file when: - recorders_storage.s3 is defined - - recorders_files is defined diff --git a/sre/roles/recorders/tasks/uninstall_topology_recorders_kubernetes.yaml b/sre/roles/recorders/tasks/uninstall_topology_recorders_kubernetes.yaml index 538cf68c..83e2fc0c 100644 --- a/sre/roles/recorders/tasks/uninstall_topology_recorders_kubernetes.yaml +++ b/sre/roles/recorders/tasks/uninstall_topology_recorders_kubernetes.yaml @@ -8,19 +8,19 @@ label_selectors: - app.kubernetes.io/name = kubernetes-topology-recorder - app.kubernetes.io/part-of = it-bench - register: recorders_pods_info + register: recorders_kubernetes_pods_info - name: Copy records directory from pod kubernetes.core.k8s_cp: kubeconfig: "{{ recorders_cluster.kubeconfig }}" local_path: /tmp/topology - namespace: "{{ recorders_pods_info.resources[0].metadata.namespace }}" - pod: "{{ recorders_pods_info.resources[0].metadata.name }}" + namespace: "{{ recorders_kubernetes_pods_info.resources[0].metadata.namespace }}" + pod: "{{ recorders_kubernetes_pods_info.resources[0].metadata.name }}" remote_path: /opt/app-root/src/records state: from_pod when: - - recorders_pods_info is defined - - recorders_pods_info.resources | length == 1 + - recorders_kubernetes_pods_info is defined + - recorders_kubernetes_pods_info.resources | length == 1 - name: Uninstall Kubernetes Topology Monitor Recorder kubernetes.core.k8s: @@ -48,20 +48,19 @@ path: /tmp/topology patterns: - "*.json" - register: recorders_files + register: recorders_kubernetes_files - name: Copy exported data into local directory ansible.builtin.copy: dest: "{{ recorders_storage.local.directory }}/{{ file.path | basename }}" mode: "0644" src: "{{ file.path }}" - loop: "{{ 
recorders_files.files }}" + loop: "{{ recorders_kubernetes_files.files }}" loop_control: label: file/{{ file.path | basename }} loop_var: file when: - recorders_storage.local is defined - - recorders_files is defined - name: Upload exported data to S3 bucket amazon.aws.s3_object: @@ -70,10 +69,9 @@ object: "/{{ recorders_storage.s3.directory }}/{{ file.path | basename }}" src: "{{ file.path }}" mode: put - loop: "{{ recorders_files.files }}" + loop: "{{ recorders_kubernetes_files.files }}" loop_control: label: file/{{ file.path | basename }} loop_var: file when: - recorders_storage.s3 is defined - - recorders_files is defined diff --git a/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml b/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml index 228f6cd4..eea20be5 100644 --- a/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml +++ b/sre/roles/recorders/tasks/uninstall_traces_recorders_jaeger.yaml @@ -8,19 +8,18 @@ label_selectors: - app.kubernetes.io/name = jaeger-traces-recorder - app.kubernetes.io/part-of = it-bench - register: recorders_pods_info + register: recorders_jaeger_pods_info - name: Copy records directory from pod kubernetes.core.k8s_cp: kubeconfig: "{{ recorders_cluster.kubeconfig }}" local_path: /tmp/traces - namespace: "{{ recorders_pods_info.resources[0].metadata.namespace }}" - pod: "{{ recorders_pods_info.resources[0].metadata.name }}" + namespace: "{{ recorders_jaeger_pods_info.resources[0].metadata.namespace }}" + pod: "{{ recorders_jaeger_pods_info.resources[0].metadata.name }}" remote_path: /opt/app-root/src/records state: from_pod when: - - recorders_pods_info is defined - - recorders_pods_info.resources | length == 1 + - recorders_jaeger_pods_info.resources | length == 1 - name: Uninstall Jaeger Traces Recorder kubernetes.core.k8s: @@ -48,20 +47,20 @@ path: /tmp/traces patterns: - "*.json" - register: recorders_files + register: recorders_jaeger_files - name: Copy exported data into local directory ansible.builtin.copy: dest: "{{ recorders_storage.local.directory }}/{{ file.path | basename }}" mode: "0644" src: "{{ file.path }}" - loop: "{{ recorders_files.files }}" + loop: "{{ recorders_jaeger_files.files }}" loop_control: label: file/{{ file.path | basename }} loop_var: file when: - recorders_storage.local is defined - - recorders_files is defined + - recorders_jaeger_files is defined - name: Upload exported data to S3 bucket amazon.aws.s3_object: @@ -70,10 +69,10 @@ object: "/{{ recorders_storage.s3.directory }}/{{ file.path | basename }}" src: "{{ file.path }}" mode: put - loop: "{{ recorders_files.files }}" + loop: "{{ recorders_jaeger_files.files }}" loop_control: label: file/{{ file.path | basename }} loop_var: file when: - recorders_storage.s3 is defined - - recorders_files is defined + - recorders_jaeger_files is defined From dbf9abc78675de5c35ec482efc254a786e44e34a Mon Sep 17 00:00:00 2001 From: Gerard Vanloo Date: Wed, 20 Aug 2025 20:40:29 -0400 Subject: [PATCH 31/35] chore: add note about additional awx jobs Signed-off-by: Gerard Vanloo --- sre/roles/awx/tasks/configure_jobs.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/sre/roles/awx/tasks/configure_jobs.yaml b/sre/roles/awx/tasks/configure_jobs.yaml index 2582fd82..580b7c34 100644 --- a/sre/roles/awx/tasks/configure_jobs.yaml +++ b/sre/roles/awx/tasks/configure_jobs.yaml @@ -170,6 +170,7 @@ # TODO: Complete the refactoring of the following code: # # Telemetry Access code will be added directly to the Applications Role +# A 10 minute pause will also be 
needed - name: Creating/removing job template to setup for telemetry access awx.awx.job_template: From a2f74892d3b3106c1e6ee832f8368e0b7e39ed76 Mon Sep 17 00:00:00 2001 From: Gerard Vanloo Date: Wed, 20 Aug 2025 21:48:02 -0400 Subject: [PATCH 32/35] fix: correct lambda expression Signed-off-by: Gerard Vanloo --- sre/roles/recorders/files/alerts/prometheus/scripts/gather.py | 2 +- .../recorders/tasks/install_alerts_recorders_prometheus.yaml | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/sre/roles/recorders/files/alerts/prometheus/scripts/gather.py b/sre/roles/recorders/files/alerts/prometheus/scripts/gather.py index a50f3aae..aac69b8d 100644 --- a/sre/roles/recorders/files/alerts/prometheus/scripts/gather.py +++ b/sre/roles/recorders/files/alerts/prometheus/scripts/gather.py @@ -47,7 +47,7 @@ def main(): content = response.json() alerts = content.get("data", {}).get("alerts", []) - firing_alerts = list(filter(lambda a: a.get("state", "") == "firing")) + firing_alerts = list(filter(lambda a: a.get("state", "") == "firing", alerts)) logger.info("retrieved {0} alerts from prometheus server".format(len(alerts))) logger.info("retrieved {0} alerts are in firing state".format(len(firing_alerts))) diff --git a/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml b/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml index 4858f4cc..800db890 100644 --- a/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml +++ b/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml @@ -36,7 +36,6 @@ when: - recorders_cluster.platform == "openshift" - tools_prometheus_bearer_token is defined - - recorders_prometheus_env_vars is defined - name: Create Secret with bearer token kubernetes.core.k8s: From 4d6e574d8515e4a36b82674942ae39a605e59bb7 Mon Sep 17 00:00:00 2001 From: Gerard Vanloo Date: Fri, 22 Aug 2025 10:07:38 -0400 Subject: [PATCH 33/35] refactor: change kubernetes topology recorder to job based design Signed-off-by: Gerard Vanloo --- .../files/topology/kubernetes/deployment.yaml | 45 +++++++++++++++++++ .../kubernetes/{statefulset.yaml => job.yaml} | 4 +- .../kubernetes/persistentvolumeclaim.yaml | 15 +++++++ .../topology/kubernetes/scripts/gather.py | 3 -- ...install_topology_recorders_kubernetes.yaml | 45 ++++++++++--------- ...install_topology_recorders_kubernetes.yaml | 39 ++++++++++++++-- .../kubernetes/{statefulset.j2 => job.j2} | 27 ++++------- 7 files changed, 129 insertions(+), 49 deletions(-) create mode 100644 sre/roles/recorders/files/topology/kubernetes/deployment.yaml rename sre/roles/recorders/files/topology/kubernetes/{statefulset.yaml => job.yaml} (90%) create mode 100644 sre/roles/recorders/files/topology/kubernetes/persistentvolumeclaim.yaml rename sre/roles/recorders/templates/topology/kubernetes/{statefulset.j2 => job.j2} (75%) diff --git a/sre/roles/recorders/files/topology/kubernetes/deployment.yaml b/sre/roles/recorders/files/topology/kubernetes/deployment.yaml new file mode 100644 index 00000000..ea978207 --- /dev/null +++ b/sre/roles/recorders/files/topology/kubernetes/deployment.yaml @@ -0,0 +1,45 @@ +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/name: kubernetes-topology-record-retriever + app.kubernetes.io/part-of: it-bench + name: kubernetes-topology-record-retriever +spec: + selector: + matchLabels: + app.kubernetes.io/name: kubernetes-topology-record-retriever + app.kubernetes.io/part-of: it-bench + template: + metadata: + annotations: + openshift.io/required-scc: 
restricted-v2 + labels: + app.kubernetes.io/name: kubernetes-topology-record-retriever + app.kubernetes.io/part-of: it-bench + spec: + containers: + - name: retriever + image: registry.access.redhat.com/ubi9/ubi-minimal:9.6-1755695350 + command: + - /bin/sh + args: + - -c + - "sleep 600" + resources: + requests: + cpu: 10m + memory: 50Mi + limits: + memory: 100Mi + volumeMounts: + - name: records + mountPath: /opt/app-root/src/records + securityContext: + fsGroup: 1001 + volumes: + - name: records + persistentVolumeClaim: + claimName: kubernetes-topology-records + replicas: 1 diff --git a/sre/roles/recorders/files/topology/kubernetes/statefulset.yaml b/sre/roles/recorders/files/topology/kubernetes/job.yaml similarity index 90% rename from sre/roles/recorders/files/topology/kubernetes/statefulset.yaml rename to sre/roles/recorders/files/topology/kubernetes/job.yaml index 5bc17cac..a49dbb7c 100644 --- a/sre/roles/recorders/files/topology/kubernetes/statefulset.yaml +++ b/sre/roles/recorders/files/topology/kubernetes/job.yaml @@ -3,8 +3,8 @@ # Dependabot to track the image. The full definition is provided in the # template version of this object. -apiVersion: apps/v1 -kind: StatefulSet +apiVersion: batch/v1 +kind: Job metadata: name: kubernetes-topology-recorder spec: diff --git a/sre/roles/recorders/files/topology/kubernetes/persistentvolumeclaim.yaml b/sre/roles/recorders/files/topology/kubernetes/persistentvolumeclaim.yaml new file mode 100644 index 00000000..8b8b16a7 --- /dev/null +++ b/sre/roles/recorders/files/topology/kubernetes/persistentvolumeclaim.yaml @@ -0,0 +1,15 @@ +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + labels: + app.kubernetes.io/name: kubernetes-topology-recorder + app.kubernetes.io/part-of: it-bench + name: kubernetes-topology-records +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 5Gi + volumeMode: Filesystem diff --git a/sre/roles/recorders/files/topology/kubernetes/scripts/gather.py b/sre/roles/recorders/files/topology/kubernetes/scripts/gather.py index b5992810..cf9d7763 100644 --- a/sre/roles/recorders/files/topology/kubernetes/scripts/gather.py +++ b/sre/roles/recorders/files/topology/kubernetes/scripts/gather.py @@ -48,8 +48,5 @@ def main(): with open(file_path, "w") as f: json.dump(content, f, indent=4) - logger.debug("download complete. 
begin sleeping.") - time.sleep(1800) - if __name__ == "__main__": main() diff --git a/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml b/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml index c20cdc34..29367cd6 100644 --- a/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml +++ b/sre/roles/recorders/tasks/install_topology_recorders_kubernetes.yaml @@ -8,9 +8,16 @@ kubeconfig: "{{ recorders_cluster.kubeconfig }}" platform: "{{ recorders_cluster.platform }}" -- name: Load statefulset information +- name: Load job information ansible.builtin.set_fact: - recorders_kubernetes_statefulset: "{{ lookup('ansible.builtin.file', 'files/topology/kubernetes/statefulset.yaml') | from_yaml }}" + recorders_kubernetes_job: "{{ lookup('ansible.builtin.file', 'files/topology/kubernetes/job.yaml') | from_yaml }}" + +- name: Create PersistentVolumeClaim to retain records + kubernetes.core.k8s: + kubeconfig: "{{ recorders_cluster.kubeconfig }}" + namespace: "{{ recorders_namespace.name }}" + src: files/topology/kubernetes/persistentvolumeclaim.yaml + state: present - name: Create ConfigMap with Python script kubernetes.core.k8s: @@ -22,31 +29,27 @@ python_script_file_contents: "{{ lookup('ansible.builtin.file', 'files/topology/kubernetes/scripts/gather.py') }}" requirements_file_contents: "{{ lookup('ansible.builtin.file', 'files/topology/kubernetes/scripts/requirements.txt') }}" +- name: Wait for any ongoing jobs to be removed + kubernetes.core.k8s_info: + api_version: "{{ recorders_kubernetes_job.apiVersion }}" + kind: "{{ recorders_kubernetes_job.kind }}" + kubeconfig: "{{ recorders_cluster.kubeconfig }}" + name: "{{ recorders_kubernetes_job.metadata.name }}" + namespace: "{{ recorders_namespace.name }}" + register: recorders_kubernetes_job_info + until: + - recorders_kubernetes_job_info.resources | length == 0 + retries: 8 + delay: 15 + - name: Install Kubernetes Topology Recorder kubernetes.core.k8s: kubeconfig: "{{ recorders_cluster.kubeconfig }}" namespace: "{{ recorders_namespace.name }}" - template: templates/topology/kubernetes/statefulset.j2 + template: templates/topology/kubernetes/job.j2 state: present - register: recorders_kubernetes_workload vars: - container_image: "{{ recorders_kubernetes_statefulset.spec.template.spec.containers[0].image }}" + container_image: "{{ recorders_kubernetes_job.spec.template.spec.containers[0].image }}" container_environment_variables: - name: KUBERNETES_TOPOLOGY_MONITOR_ENDPOINT value: "{{ tools_kubernetes_topology_mapper_endpoint }}" - -- name: Wait for workload to update - kubernetes.core.k8s_info: - api_version: "{{ recorders_kubernetes_workload.result.apiVersion }}" - kind: "{{ recorders_kubernetes_workload.result.kind }}" - kubeconfig: "{{ recorders_cluster.kubeconfig }}" - name: "{{ recorders_kubernetes_workload.result.metadata.name }}" - namespace: "{{ recorders_kubernetes_workload.result.metadata.namespace }}" - register: recorders_kubernetes_statefulset_info - until: - - recorders_kubernetes_statefulset_info.resources | length > 0 - - recorders_kubernetes_statefulset_info.resources[0].status is defined - - recorders_kubernetes_statefulset_info.resources[0].status.readyReplicas is defined - - recorders_kubernetes_statefulset_info.resources[0].status.readyReplicas == 1 - retries: 8 - delay: 15 diff --git a/sre/roles/recorders/tasks/uninstall_topology_recorders_kubernetes.yaml b/sre/roles/recorders/tasks/uninstall_topology_recorders_kubernetes.yaml index 83e2fc0c..a07ba83a 100644 --- 
a/sre/roles/recorders/tasks/uninstall_topology_recorders_kubernetes.yaml +++ b/sre/roles/recorders/tasks/uninstall_topology_recorders_kubernetes.yaml @@ -1,12 +1,35 @@ --- -- name: Retrieve the topology recorder pod name +- name: Wait for any ongoing jobs to be removed + kubernetes.core.k8s_info: + api_version: batch/v1 + kind: Job + kubeconfig: "{{ recorders_cluster.kubeconfig }}" + label_selectors: + - app.kubernetes.io/name = kubernetes-topology-recorder + - app.kubernetes.io/part-of = it-bench + namespace: "{{ recorders_namespace.name }}" + register: recorders_kubernetes_job_info + until: + - recorders_kubernetes_job_info.resources | length == 0 + retries: 8 + delay: 15 + +- name: Create Deployment to retrieve records + kubernetes.core.k8s: + kubeconfig: "{{ recorders_cluster.kubeconfig }}" + namespace: "{{ recorders_namespace.name }}" + src: files/topology/kubernetes/deployment.yaml + state: present + wait: true + +- name: Retrieve the retriever pod name kubernetes.core.k8s_info: api_version: v1 kind: Pod kubeconfig: "{{ recorders_cluster.kubeconfig }}" namespace: "{{ recorders_namespace.name }}" label_selectors: - - app.kubernetes.io/name = kubernetes-topology-recorder + - app.kubernetes.io/name = kubernetes-topology-record-retriever - app.kubernetes.io/part-of = it-bench register: recorders_kubernetes_pods_info @@ -22,11 +45,11 @@ - recorders_kubernetes_pods_info is defined - recorders_kubernetes_pods_info.resources | length == 1 -- name: Uninstall Kubernetes Topology Monitor Recorder +- name: Uninstall the Deployment kubernetes.core.k8s: kubeconfig: "{{ recorders_cluster.kubeconfig }}" namespace: "{{ recorders_namespace.name }}" - src: files/alerts/prometheus/statefulset.yaml + src: files/topology/kubernetes/deployment.yaml state: absent wait: true @@ -43,6 +66,14 @@ state: absent wait: true +- name: Delete PersistentVolumeClaim + kubernetes.core.k8s: + kubeconfig: "{{ recorders_cluster.kubeconfig }}" + namespace: "{{ recorders_namespace.name }}" + src: files/topology/kubernetes/persistentvolumeclaim.yaml + state: absent + wait: true + - name: Find all exported JSON files ansible.builtin.find: path: /tmp/topology diff --git a/sre/roles/recorders/templates/topology/kubernetes/statefulset.j2 b/sre/roles/recorders/templates/topology/kubernetes/job.j2 similarity index 75% rename from sre/roles/recorders/templates/topology/kubernetes/statefulset.j2 rename to sre/roles/recorders/templates/topology/kubernetes/job.j2 index 72455363..82b6f685 100644 --- a/sre/roles/recorders/templates/topology/kubernetes/statefulset.j2 +++ b/sre/roles/recorders/templates/topology/kubernetes/job.j2 @@ -1,16 +1,12 @@ --- -apiVersion: apps/v1 -kind: StatefulSet +apiVersion: batch/v1 +kind: Job metadata: labels: app.kubernetes.io/name: kubernetes-topology-recorder app.kubernetes.io/part-of: it-bench name: kubernetes-topology-recorder spec: - selector: - matchLabels: - app.kubernetes.io/name: kubernetes-topology-recorder - app.kubernetes.io/part-of: it-bench template: metadata: annotations: @@ -41,8 +37,9 @@ spec: - name: scripts mountPath: /opt/app-root/src/scripts readOnly: true - - name: kubernetes-topology-records + - name: records mountPath: /opt/app-root/src/records + restartPolicy: Never securityContext: fsGroup: 1001 volumes: @@ -58,15 +55,7 @@ spec: items: - key: deps path: requirements.txt - replicas: 1 - volumeClaimTemplates: - - metadata: - name: kubernetes-topology-records - spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 5Gi - persistentVolumeClaimRetentionPolicy: - 
whenDeleted: Delete + - name: records + persistentVolumeClaim: + claimName: kubernetes-topology-records + ttlSecondsAfterFinished: 10 From d2202f4aa865bcb72d9cd77438715eab888243e9 Mon Sep 17 00:00:00 2001 From: Gerard Vanloo Date: Fri, 22 Aug 2025 10:30:27 -0400 Subject: [PATCH 34/35] chore: update timing of recorders in make commands Signed-off-by: Gerard Vanloo --- sre/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sre/Makefile b/sre/Makefile index 4b632898..e162bd8a 100644 --- a/sre/Makefile +++ b/sre/Makefile @@ -103,7 +103,7 @@ create_environment: deploy_tools deploy_applications ## Deploys tools and applic destroy_environment: undeploy_applications undeploy_tools ## Undeploys tools and applications to cluster .PHONY: start_incident -start_incident: create_environment inject_incident_fault deploy_recorders ## Starts an incident by deploying a stack, applications, faults, and recorders for an incident +start_incident: create_environment deploy_recorders inject_incident_fault ## Starts an incident by deploying a stack, applications, faults, and recorders for an incident .PHONY: stop_incident stop_incident: undeploy_recorders remove_incident_fault destroy_environment ## Stops an incident by undeploying a stack, applications, faults, and recorders for an incident From df75db9dc1174e9d1313e6b45f938e87308c40dc Mon Sep 17 00:00:00 2001 From: Gerard Vanloo Date: Fri, 22 Aug 2025 11:22:54 -0400 Subject: [PATCH 35/35] fix: update file path Signed-off-by: Gerard Vanloo --- .../recorders/tasks/install_alerts_recorders_prometheus.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml b/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml index 800db890..b3014433 100644 --- a/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml +++ b/sre/roles/recorders/tasks/install_alerts_recorders_prometheus.yaml @@ -10,7 +10,7 @@ - name: Load statefulset information ansible.builtin.set_fact: - recorders_prometheus_statefulset: "{{ lookup('ansible.builtin.file', 'files/topology/kubernetes/statefulset.yaml') | from_yaml }}" + recorders_prometheus_statefulset: "{{ lookup('ansible.builtin.file', 'files/alerts/prometheus/statefulset.yaml') | from_yaml }}" - name: Create Prometheus Alert Recorder environment list ansible.builtin.set_fact: