Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions .github/workflows/sre-integration-smoke-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,44 @@ concurrency:
cancel-in-progress: true

jobs:
# Smoke test for the agent access Makefile targets: creates a Kind cluster
# and an incident environment, then runs `enable_agent_access` followed by
# `disable_agent_access`.
agent-access:
  name: Agent Access Smoke Tests
  runs-on: ubuntu-24.04
  # Only starts after these jobs have completed successfully.
  needs:
    - opentelemetry
    - otel-demo
    - prometheus
  steps:
    # --- Toolchain -------------------------------------------------------
    - uses: actions/checkout@v5.0.0
    - uses: actions/setup-python@v5.6.0
      with:
        python-version: '3.12'
    - uses: actions/setup-go@v5.5.0
      with:
        cache-dependency-path: sre/dev/local_cluster/go.sum
        go-version-file: sre/dev/local_cluster/go.mod
    - uses: azure/setup-helm@v4.3.1
      with:
        version: v3.18.3
    - name: Install Python and Ansible dependencies
      run: |
        pip install -r sre/requirements.txt
        ansible-galaxy install -r sre/requirements.yaml

    # --- Cluster and environment -----------------------------------------
    - name: Create Kind cluster
      run: |
        make -C sre/dev/local_cluster create_cluster
    - name: Create group vars
      run: |
        make -C sre group_vars
    - name: Create environment
      run: |
        INCIDENT_NUMBER=1 make -C sre create_environment

    # --- Targets under test ----------------------------------------------
    - name: Test access granting
      run: |
        make -C sre enable_agent_access
    - name: Test access revoking
      run: |
        make -C sre disable_agent_access
chaos-mesh:
name: Chaos Mesh Smoke Tests
runs-on: ubuntu-24.04
Expand Down
8 changes: 8 additions & 0 deletions sre/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,14 @@ start_incident: create_environment inject_incident_fault ## Starts an incident b
# Tears an incident down: runs remove_incident_fault, then destroy_environment.
.PHONY: stop_incident
stop_incident: remove_incident_fault destroy_environment ## Stops an incident by undeploying a stack, application, and fault for an incident

# Runs the agent access playbook restricted to the tasks tagged `grant_access`.
.PHONY: enable_agent_access
enable_agent_access: ## Adds access controls for llm agent
	ansible-playbook -i inventory.yaml playbooks/manage_agent_access.yaml --tags "grant_access"

# Runs the agent access playbook restricted to the tasks tagged `revoke_access`.
.PHONY: disable_agent_access
disable_agent_access: ## Removes access controls for llm agent
	ansible-playbook -i inventory.yaml playbooks/manage_agent_access.yaml --tags "revoke_access"

.PHONY: deploy_awx_stack
deploy_awx_stack: ## Deploys AWX to a cluster
ansible-playbook -i inventory.yaml playbooks/manage_awx.yaml --tags "install_tools"
Expand Down
41 changes: 41 additions & 0 deletions sre/playbooks/manage_agent_access.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
---
# Playbook driving the `agent` role. Which of the role's tasks run is decided
# by the `--tags` passed on the command line; everything tagged `always`
# below runs regardless of the selected tag.
- name: Manage SRE and FinOps Agent Namespace Access
  hosts:
    - environment

  pre_tasks:
    - name: Import system role
      ansible.builtin.import_role:
        name: system
      vars:
        system_cluster:
          kubeconfig: "{{ cluster.kubeconfig }}"
      tags:
        - always

  tasks:
    # The namespace names passed to the agent role are read from the Helm
    # release defaults of the tools and applications roles, so both variable
    # files are loaded first.
    - name: Include Helm Release variables from tools role
      ansible.builtin.include_vars:
        file: ../roles/tools/defaults/main/helm_releases.yaml
      tags:
        - always

    - name: Include Helm Release variables from applications role
      ansible.builtin.include_vars:
        file: ../roles/applications/defaults/main/helm_releases.yaml
      tags:
        - always

    - name: Import agent role
      ansible.builtin.import_role:
        name: agent
      vars:
        agent_cluster:
          kubeconfig: "{{ cluster.kubeconfig }}"
        agent_namespaces:
          # Namespaces with read-write access
          - name: "{{ applications_helm_releases.otel_demo.namespace }}"
            access: read-write
          - name: "{{ tools_helm_releases.chaos_mesh.namespace }}"
            access: read-write
          # Namespaces with read-only access
          - name: "{{ tools_helm_releases.opentelemetry_collectors.namespace }}"
            access: read-only
          - name: "{{ tools_helm_releases.prometheus.namespace }}"
            access: read-only
2 changes: 1 addition & 1 deletion sre/playbooks/manage_incidents.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
---
- name: Manage SRE and FinOps Incident Environment Tool Stack
- name: Manage SRE and FinOps Incident (Fault Stack)
hosts:
- environment
pre_tasks:
Expand Down
35 changes: 35 additions & 0 deletions sre/roles/agent/meta/argument_specs.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
---
# Argument specification for the `agent` role. The descriptions below were
# previously copied from the applications role and did not describe this role.
argument_specs:
  main:
    short_description: Main entry point for agent role
    description:
      - This role is responsible for granting and revoking the agent's access to namespaces in a cluster.
      - Granting access also generates a restricted kubeconfig file for the agent.
    author:
      - Gerard Vanloo
    options:
      agent_cluster:
        description:
          - The Kubernetes or OpenShift cluster in which the agent's access will be granted or revoked.
        required: true
        type: dict
        options:
          kubeconfig:
            description:
              - The file path of the Kubernetes configuration file.
              - Use '~' over '$HOME' to substitute for the home directory or provide the absolute path.
            required: true
            type: str
      agent_namespaces:
        description:
          - The namespaces the agent is granted access to, each paired with an access level.
        elements: dict
        required: true
        type: list
        options:
          access:
            description:
              - The level of access the agent is granted in the namespace.
            choices:
              - read-only
              - read-write
            required: true
            type: str
          name:
            description:
              - The name of the namespace.
            required: true
            type: str
81 changes: 81 additions & 0 deletions sre/roles/agent/tasks/add_namespace_rbac.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
---
# Grants the agent access to one namespace. Included once per entry of
# `agent_namespaces`, with the entry available as `namespace`
# (`namespace.name`, `namespace.access`).
#
# Every task after the lookup is guarded by
# `agent_namespace_info.resources | length == 1`, so nothing is created when
# the namespace does not exist.

- name: Retrieve information about namespace
  kubernetes.core.k8s_info:
    api_version: v1
    kind: Namespace
    kubeconfig: "{{ agent_cluster.kubeconfig }}"
    # Namespace is a cluster-scoped kind, so it has to be selected by `name`.
    # Filtering by `namespace` does not narrow the query and returns every
    # namespace in the cluster, which made the `length == 1` guards below
    # false and skipped all RBAC creation.
    name: "{{ namespace.name }}"
  register: agent_namespace_info

- name: Create Service Account
  kubernetes.core.k8s:
    kubeconfig: "{{ agent_cluster.kubeconfig }}"
    resource_definition:
      apiVersion: v1
      kind: ServiceAccount
      metadata:
        # These labels are what the kubeconfig generation and the revoke
        # tasks select on.
        labels:
          app.kubernetes.io/component: access-provider
          app.kubernetes.io/name: agent
          app.kubernetes.io/part-of: it-bench
        name: agent
        namespace: "{{ namespace.name }}"
    state: present
  register: agent_service_account
  when:
    - agent_namespace_info.resources | length == 1

# --- read-write access -----------------------------------------------------

- name: Create Role with read-write access
  kubernetes.core.k8s:
    kubeconfig: "{{ agent_cluster.kubeconfig }}"
    namespace: "{{ namespace.name }}"
    template: templates/role.j2
    state: present
  register: agent_rw_role
  vars:
    verbs:
      - "*"
  when:
    - namespace.access == 'read-write'
    - agent_namespace_info.resources | length == 1

- name: Create RoleBinding for read-write Role
  kubernetes.core.k8s:
    kubeconfig: "{{ agent_cluster.kubeconfig }}"
    namespace: "{{ namespace.name }}"
    template: templates/rolebinding.j2
    state: present
  vars:
    role: "{{ agent_rw_role.result }}"
    service_account: "{{ agent_service_account.result }}"
  when:
    - namespace.access == 'read-write'
    - agent_namespace_info.resources | length == 1

# --- read-only access ------------------------------------------------------

- name: Create Role with read-only access
  kubernetes.core.k8s:
    kubeconfig: "{{ agent_cluster.kubeconfig }}"
    namespace: "{{ namespace.name }}"
    template: templates/role.j2
    state: present
  register: agent_ro_role
  vars:
    verbs:
      - "get"
      - "list"
  when:
    - namespace.access == 'read-only'
    - agent_namespace_info.resources | length == 1

- name: Create RoleBinding for read-only Role
  kubernetes.core.k8s:
    kubeconfig: "{{ agent_cluster.kubeconfig }}"
    namespace: "{{ namespace.name }}"
    template: templates/rolebinding.j2
    state: present
  vars:
    role: "{{ agent_ro_role.result }}"
    service_account: "{{ agent_service_account.result }}"
  when:
    - namespace.access == 'read-only'
    - agent_namespace_info.resources | length == 1
63 changes: 63 additions & 0 deletions sre/roles/agent/tasks/generate_restricted_kubeconfig.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
---
# Builds a kubeconfig for the agent: finds the labelled Service Accounts,
# requests a token for each, and renders templates/kubeconfig.j2 to
# /tmp/restricted_config using the cluster entry of the admin kubeconfig.

- name: Retrieve Service Accounts for namespaces
  kubernetes.core.k8s_info:
    api_version: v1
    kind: ServiceAccount
    kubeconfig: "{{ agent_cluster.kubeconfig }}"
    label_selectors:
      - app.kubernetes.io/component = access-provider
      - app.kubernetes.io/part-of = it-bench
  register: agent_service_account_info

- name: Create TokenRequest for each Service Account
  kubernetes.core.k8s:
    kubeconfig: "{{ agent_cluster.kubeconfig }}"
    resource_definition:
      apiVersion: authentication.k8s.io/v1
      kind: TokenRequest
      metadata:
        name: "{{ sa.metadata.name }}"
        namespace: "{{ sa.metadata.namespace }}"
      spec:
        expirationSeconds: 64800  # 18h
    state: present
  loop: "{{ agent_service_account_info.resources }}"
  loop_control:
    label: serviceaccount/{{ sa.metadata.name }}
    loop_var: sa
  register: agent_token_request

- name: Load kubeconfig file
  ansible.builtin.set_fact:
    agent_kubeconfig: "{{ lookup('ansible.builtin.file', agent_cluster.kubeconfig) | from_yaml }}"

# NOTE(review): this matches cluster names against the *context* name in
# `current-context`; that only works when the two are identical — confirm.
- name: Find index of current context cluster
  ansible.builtin.set_fact:
    agent_cluster_index: "{{ lookup('ansible.utils.index_of', agent_kubeconfig.clusters, 'eq', cluster, 'name') }}"
  vars:
    cluster: "{{ agent_kubeconfig['current-context'] }}"

# Each loop item is one per-iteration result of the TokenRequest task above:
# the module output is under `.result` and the Service Account it was created
# for is under `.sa` (that task's loop_var). The previous expressions
# referenced `result` and `kv`, neither of which is defined in this task.
#
# NOTE(review): kubeconfig context entries are normally shaped as
# `{name, context: {cluster, user, namespace}}`, and every user generated
# here shares the name 'agent' — verify the rendered file against kubectl.
- name: Generate contexts and users
  ansible.builtin.set_fact:
    agent_contexts: "{{ (agent_contexts | default([])) + [{'cluster': cluster, 'user': 'agent', 'namespace': tokenrequest.sa.metadata.namespace}] }}"
    agent_users: "{{ (agent_users | default([])) + [{'name': 'agent', 'user': {'token': tokenrequest.result.status.token}}] }}"
  loop: "{{ agent_token_request.results }}"
  loop_control:
    label: tokenrequest/{{ tokenrequest.sa.metadata.name }}
    loop_var: tokenrequest
  vars:
    cluster: "{{ agent_kubeconfig['current-context'] }}"
  when:
    # Skip results that carry no token.
    - tokenrequest.result.status.token | default('') | length > 0

# NOTE(review): the file holds bearer tokens but is written world-readable
# ("0644") under /tmp — consider a tighter mode if no other user must read it.
- name: Create restricted kubeconfig file in temporary directory
  ansible.builtin.copy:
    content: "{{ lookup('ansible.builtin.template', 'templates/kubeconfig.j2') | from_yaml | to_nice_yaml(indent=2) }}"
    dest: /tmp/restricted_config
    mode: "0644"
  vars:
    # `| int` guards against the index having been stored as a string by
    # set_fact, which cannot be used to index a list.
    certificate_authority_data: "{{ agent_kubeconfig.clusters[agent_cluster_index | int].cluster['certificate-authority-data'] }}"
    cluster: "{{ agent_kubeconfig.clusters[agent_cluster_index | int].name }}"
    contexts: "{{ agent_contexts }}"
    server: "{{ agent_kubeconfig.clusters[agent_cluster_index | int].cluster.server }}"
    users: "{{ agent_users }}"
31 changes: 31 additions & 0 deletions sre/roles/agent/tasks/main.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
---
# Entry point of the agent role. Each include carries its tag twice: once on
# the include task itself and once under `apply`, which passes the tag on to
# the tasks inside the included file.

# --- grant_access ----------------------------------------------------------

- name: Include tasks for granting agent access to namespaces
  tags:
    - grant_access
  ansible.builtin.include_tasks:
    file: add_namespace_rbac.yaml
    apply:
      tags:
        - grant_access
  # One inclusion per requested namespace, exposed to the file as `namespace`.
  loop: "{{ agent_namespaces }}"
  loop_control:
    loop_var: namespace
    label: namespace/{{ namespace.name }}

- name: Include tasks for restricted kubeconfig generation
  tags:
    - grant_access
  ansible.builtin.include_tasks:
    file: generate_restricted_kubeconfig.yaml
    apply:
      tags:
        - grant_access

# --- revoke_access ---------------------------------------------------------

- name: Include tasks for revoking agent access to namespaces
  tags:
    - revoke_access
  ansible.builtin.include_tasks:
    file: remove_namespace_rbac.yaml
    apply:
      tags:
        - revoke_access
36 changes: 36 additions & 0 deletions sre/roles/agent/tasks/remove_namespace_rbac.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
---
# Revokes the agent's access by deleting the labelled RBAC objects. The three
# tasks use the same label selectors and no namespace; they run in the order
# RoleBinding, Role, Service Account, each waiting on its deletion.

- name: Delete RoleBinding
  kubernetes.core.k8s:
    kubeconfig: "{{ agent_cluster.kubeconfig }}"
    api_version: rbac.authorization.k8s.io/v1
    kind: RoleBinding
    label_selectors:
      - app.kubernetes.io/component = access-provider
      - app.kubernetes.io/part-of = it-bench
    delete_all: true
    state: absent
    wait: true

- name: Delete Role
  kubernetes.core.k8s:
    kubeconfig: "{{ agent_cluster.kubeconfig }}"
    api_version: rbac.authorization.k8s.io/v1
    kind: Role
    label_selectors:
      - app.kubernetes.io/component = access-provider
      - app.kubernetes.io/part-of = it-bench
    delete_all: true
    state: absent
    wait: true

- name: Delete Service Account
  kubernetes.core.k8s:
    kubeconfig: "{{ agent_cluster.kubeconfig }}"
    api_version: v1
    kind: ServiceAccount
    label_selectors:
      - app.kubernetes.io/component = access-provider
      - app.kubernetes.io/part-of = it-bench
    delete_all: true
    state: absent
    wait: true
12 changes: 12 additions & 0 deletions sre/roles/agent/templates/kubeconfig.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
---
{# Kubeconfig skeleton for the agent. Variables substituted below: #}
{# server, certificate_authority_data, cluster, contexts, users. #}
apiVersion: v1
kind: Config
preferences: {}
current-context: {{ cluster }}
clusters:
  - name: {{ cluster }}
    cluster:
      certificate-authority-data: {{ certificate_authority_data }}
      server: {{ server }}
contexts: {{ contexts }}
users: {{ users }}
Loading
Loading