Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 12 additions & 6 deletions sre/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,10 @@ launch_start_workflow: ## Launches the AWX workflow's equivalent of make start_i
launch_stop_workflow: ## Launches the AWX workflow's equivalent of make stop_incident
ansible-playbook -i inventory.yaml playbooks/manage_awx.yaml --tags "launch_stop_workflow"

# Runs playbooks/generate_leaderboard_bundle.yaml against inventory.yaml.
.PHONY: generate_leaderboard_bundle
generate_leaderboard_bundle: ## Generates a bundle for Leaderboard to interface with
ansible-playbook -i inventory.yaml playbooks/generate_leaderboard_bundle.yaml

# TODO: See why the OBJC_DISABLE_INITIALIZE_FORK_SAFETY is needed

.PHONY: incident documentation
Expand Down Expand Up @@ -164,12 +168,6 @@ evaluation:
cat evaluation/e2e_new/incident_reports/$(INCIDENT_NUMBER).json
rm evaluation/e2e_new/incident_reports/$(INCIDENT_NUMBER).json

.SILENT: bundle_status
bundle_status:
ansible-playbook base.yaml --tags "bundle_status" --extra-vars "run_uuid=${RUN_UUID} sre_agent_name__version_number=${PARTICIPANT_AGENT_UUID} scenario_number=${INCIDENT_NUMBER} run_number=1 bundle_status_file=roles/bundle_status/status-$$INCIDENT_NUMBER.json s3_bucket_name_for_results='sre-runner-with-awx' sre_bench_runner=true domain=sre s3_endpoint_url='https://s3.us-east-2.amazonaws.com'" > /tmp/status.out; \
cat roles/bundle_status/status-${INCIDENT_NUMBER}.json
rm roles/bundle_status/status-${INCIDENT_NUMBER}.json

.SILENT: bundle_info
bundle_info:
ANSIBLE_STDOUT_CALLBACK=json ansible-playbook base.yaml --tags "get_bundle_info" --extra-vars "run_uuid=${RUN_UUID} sre_agent_name__version_number=${PARTICIPANT_AGENT_UUID} scenario_number=${INCIDENT_NUMBER} run_number=1 s3_bucket_name_for_results='sre-runner-with-awx' sre_bench_runner=true kubeconfig=/tmp/${CLUSTER_ASSIGNED_NAME}.yaml s3_endpoint_url='https://s3.us-east-2.amazonaws.com'" | jq '.plays[].tasks[] | select(.task.name == "bundle_info : Return grafana url").hosts.localhost.msg'
Expand Down Expand Up @@ -273,3 +271,11 @@ e2e_awx_stage_three: ## DEPRECATED: Given an incident number, run_uuid end the s
@echo "Executing 'make launch_stop_workflow'..."
@echo ""
$(MAKE) launch_stop_workflow

# Deprecated alias: prints a deprecation notice and then delegates to the
# generate_leaderboard_bundle target via a recursive make invocation.
.SILENT: bundle_status
bundle_status:
	@echo "WARNING: 'make bundle_status' is deprecated. Please use 'make generate_leaderboard_bundle' instead."
	@echo "This command will be removed in a future version."
	@echo "Executing 'make generate_leaderboard_bundle'..."
	@echo ""
	$(MAKE) generate_leaderboard_bundle
103 changes: 103 additions & 0 deletions sre/playbooks/generate_leaderboard_bundle.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
---
# Builds the leaderboard status bundle.
#
# Flow:
#   1. Fetch the recorded status/assertion JSON from the configured storage
#      (local directory or S3 bucket) into /tmp/assertion.json.
#   2. Extract the Deployed / FaultInjected / Destroyed conditions from it.
#   3. Derive a status, reason and lastTransitionTime for each condition.
#   4. Write the result to /tmp/bundle_status.json and publish it back to the
#      configured storage as bundle_status.json.
- name: Manage SRE and FinOps Leaderboard Bundle
  hosts:
    - environment
  pre_tasks:
    - name: Import system role
      ansible.builtin.import_role:
        name: system
      tags:
        - always
      vars:
        system_cluster:
          kubeconfig: "{{ cluster.kubeconfig }}"

    # Fail early when neither storage backend is configured; every task below
    # is conditional on one of them.
    - name: Validate that storage is configured
      ansible.builtin.assert:
        that: "storage.local is defined or storage.s3 is defined"
        fail_msg: Storage has not been configured. Please assign either local or s3 bucket storage.
        success_msg: Storage is configured.
  tasks:
    # NOTE(review): the local source is named status.json while the S3 source
    # below is assertion.json -- confirm which name the producer writes.
    - name: Copy status file into temporary directory from local directory
      ansible.builtin.copy:
        dest: /tmp/assertion.json
        mode: "0644"
        src: "{{ storage.local.directory }}/status.json"
      when:
        - storage.local is defined

    - name: Copy status file into temporary directory from S3 bucket
      amazon.aws.s3_object:
        endpoint_url: "{{ storage.s3.endpoint }}"
        bucket: "{{ storage.s3.bucket }}"
        object: "/{{ storage.s3.directory }}/assertion.json"
        dest: /tmp/assertion.json
        mode: get
      register: bucket_retrieval_result
      # `contents` is only returned for mode: getstr; with mode: get the retry
      # condition has to look at the task outcome instead.
      until:
        - bucket_retrieval_result is succeeded
      retries: 3
      delay: 60
      when:
        - storage.s3 is defined

    # Each condition falls back to an empty mapping when it is absent from the
    # assertion file, so later `.status is undefined` checks work.
    - name: Load assertion file contents
      ansible.builtin.set_fact:
        deployed_condition: "{{ assertion.status.conditions | selectattr('type', 'equalto', 'Deployed') | list | first | default({}) }}"
        fault_injected_condition: "{{ assertion.status.conditions | selectattr('type', 'equalto', 'FaultInjected') | list | first | default({}) }}"
        destroyed_condition: "{{ assertion.status.conditions | selectattr('type', 'equalto', 'Destroyed') | list | first | default({}) }}"
      tags:
        - always
      vars:
        # Name must match the `assertion` references in the facts above.
        assertion: "{{ lookup('ansible.builtin.file', '/tmp/assertion.json') | from_json }}"

    # Reasons are written with `>-` so the rendered value is a single line
    # without a trailing newline. Each stage reports the previous stage's
    # state until that stage has succeeded.
    - name: Construct bundle
      ansible.builtin.set_fact:
        leaderboard_bundle:
          status:
            conditions:
              - type: Deployed
                status: "{{ deployed_condition.status | default(false) | string }}"
                reason: >-
                  {{
                  "DeploymentInProgress" if deployed_condition.status is undefined
                  else "DeploymentReady" if deployed_condition.status
                  else "DeploymentFailed"
                  }}
                lastTransitionTime: "{{ deployed_condition.lastTransitionTime | default(ansible_date_time.iso8601) }}"
              - type: FaultInjected
                status: "{{ fault_injected_condition.status | default(false) | string }}"
                reason: >-
                  {{
                  "DeploymentNotReady" if deployed_condition.status is undefined
                  else "DeploymentFailed" if not deployed_condition.status
                  else "FaultInjectionInProgress" if fault_injected_condition.status is undefined
                  else "FaultInjectionReady" if fault_injected_condition.status
                  else "FaultInjectionFailed"
                  }}
                lastTransitionTime: "{{ fault_injected_condition.lastTransitionTime | default(ansible_date_time.iso8601) }}"
              - type: Destroyed
                status: "{{ destroyed_condition.status | default(false) | string }}"
                reason: >-
                  {{
                  "FaultInjectionNotReady" if fault_injected_condition.status is undefined
                  else "FaultInjectionFailed" if not fault_injected_condition.status
                  else "DestroyInProgress" if destroyed_condition.status is undefined
                  else "DestroyDone" if destroyed_condition.status
                  else "DestroyFailed"
                  }}
                lastTransitionTime: "{{ destroyed_condition.lastTransitionTime | default(ansible_date_time.iso8601) }}"

    - name: Create bundle file
      ansible.builtin.copy:
        content: "{{ leaderboard_bundle.status | to_nice_json(indent=2) }}"
        dest: "/tmp/bundle_status.json"
        mode: "0644"

    # Both publish tasks read the file written by "Create bundle file".
    - name: Copy bundle file into local directory
      ansible.builtin.copy:
        dest: "{{ storage.local.directory }}/bundle_status.json"
        mode: "0644"
        src: /tmp/bundle_status.json
      when:
        - storage.local is defined

    - name: Upload bundle file to S3 bucket
      amazon.aws.s3_object:
        endpoint_url: "{{ storage.s3.endpoint }}"
        bucket: "{{ storage.s3.bucket }}"
        object: "/{{ storage.s3.directory }}/bundle_status.json"
        src: /tmp/bundle_status.json
        mode: put
      when:
        - storage.s3 is defined
67 changes: 41 additions & 26 deletions sre/playbooks/manage_applications.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,31 +23,46 @@
when:
- incident_id is defined
tasks:
- name: Import applications role
ansible.builtin.import_role:
name: applications
vars:
applications_cluster:
kubeconfig: "{{ cluster.kubeconfig }}"
applications_required:
otel_demo:
configuration: "{{ incidents_applications.otel_demo.configuration | default({}) }}"
enabled: "{{ incidents_applications.otel_demo.enabled | default(applications.otel_demo) }}"
- name: Import required roles
block:
- name: Import leaderboard role
ansible.builtin.import_role:
name: leaderboard
vars:
leaderboard_status:
status: progressing
leaderboard_storage: "{{ storage }}"

- name: Import applications role
ansible.builtin.import_role:
name: applications
vars:
applications_cluster:
kubeconfig: "{{ cluster.kubeconfig }}"
applications_required:
otel_demo:
configuration: "{{ incidents_applications.otel_demo.configuration | default({}) }}"
enabled: "{{ incidents_applications.otel_demo.enabled | default(applications.otel_demo) }}"

# - name: Import e2e role
# ansible.builtin.import_role:
# name: e2e
# tasks_from: register_deployment_time
# tags:
# - install_applications
# when:
# - incident.runner != 'local'
- name: Import leaderboard role
ansible.builtin.import_role:
name: leaderboard
vars:
leaderboard_status:
status: succeeded
leaderboard_storage: "{{ storage }}"
rescue:
- name: Import leaderboard role
ansible.builtin.import_role:
name: leaderboard
vars:
leaderboard_status:
failed_task:
name: "{{ ansible_failed_task.name }}"
result: "{{ ansible_failed_result }}"
status: failed
leaderboard_storage: "{{ storage }}"

# - name: Import e2e role
# ansible.builtin.import_role:
# name: e2e
# tasks_from: register_deployment_failure
# tags:
# - capture_failed_deployment_state
# when:
# - incident.runner != 'local'
- name: Fail playbook
ansible.builtin.fail:
msg: "Task ({{ ansible_failed_task.name }}) has failed: {{ ansible_failed_result }}"
109 changes: 71 additions & 38 deletions sre/playbooks/manage_incidents.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,45 +40,78 @@
incidents_file:
id: "{{ incident_id }}"
tasks:
# - name: Pause for 600 seconds pre-fault removal for alert recording
# ansible.builtin.pause:
# seconds: 600
# tags:
# - pre_fault_removal
# when:
# - incident.runner != 'local'
- name: Import required roles
block:
- name: Import leaderboard role
ansible.builtin.import_role:
name: leaderboard
vars:
leaderboard_status:
status: progressing
leaderboard_storage: "{{ storage }}"

# - name: Import e2e role
# ansible.builtin.import_role:
# name: e2e
# tasks_from: record_topology_information
# tags:
# - pre_fault_removal
# when:
# - incident.runner != 'local'
# - name: Pause for 600 seconds pre-fault removal for alert recording
# ansible.builtin.pause:
# seconds: 600
# tags:
# - pre_fault_removal
# when:
# - incident.runner != 'local'

- name: Import faults role
ansible.builtin.import_role:
name: faults
vars:
faults_cluster:
kubeconfig: "{{ cluster.kubeconfig }}"
faults_specs: "{{ incidents_spec.spec.faults }}"
# - name: Import e2e role
# ansible.builtin.import_role:
# name: e2e
# tasks_from: record_topology_information
# tags:
# - pre_fault_removal
# when:
# - incident.runner != 'local'

- name: Import faults role
ansible.builtin.import_role:
name: faults
vars:
faults_cluster:
kubeconfig: "{{ cluster.kubeconfig }}"
faults_specs: "{{ incidents_spec.spec.faults }}"

# - name: Import e2e role
# ansible.builtin.import_role:
# name: e2e
# tasks_from: record_topology_information
# tags:
# - post_fault_injection
# when:
# - incident.runner != 'local'

# - name: Import e2e role
# ansible.builtin.import_role:
# name: e2e
# tasks_from: register_fault_injection_time
# tags:
# - inject_faults
# when:
# - incident.runner != 'local'

# - name: Import e2e role
# ansible.builtin.import_role:
# name: e2e
# tasks_from: record_topology_information
# tags:
# - post_fault_injection
# when:
# - incident.runner != 'local'
- name: Import leaderboard role
ansible.builtin.import_role:
name: leaderboard
vars:
leaderboard_status:
status: succeeded
leaderboard_storage: "{{ storage }}"
rescue:
- name: Import leaderboard role
ansible.builtin.import_role:
name: leaderboard
vars:
leaderboard_status:
failed_task:
name: "{{ ansible_failed_task.name }}"
result: "{{ ansible_failed_result }}"
status: failed
leaderboard_storage: "{{ storage }}"

# - name: Import e2e role
# ansible.builtin.import_role:
# name: e2e
# tasks_from: register_fault_injection_time
# tags:
# - inject_faults
# when:
# - incident.runner != 'local'
- name: Fail playbook
ansible.builtin.fail:
msg: "Task ({{ ansible_failed_task.name }}) has failed: {{ ansible_failed_result }}"
Loading
Loading