11
2- name : Test deployment and image build on OpenStack
2+ name : Test deployment and reimage on OpenStack
33on :
44 workflow_dispatch :
55 push :
88 pull_request :
99jobs :
1010 openstack :
11- name : openstack-ci-${{ matrix.cloud }}
12- strategy :
13- matrix :
14- cloud :
15- - "arcus" # Arcus OpenStack in rcp-cloud-portal-demo project, with RoCE
16- fail-fast : false # as want clouds to continue independently
11+ name : openstack-ci-arcus # Arcus OpenStack in rcp-cloud-portal-demo project, with RoCE
1712 concurrency : ${{ github.ref }} # concurrency group is the ref, so only one run at a time per branch/PR
1813 runs-on : ubuntu-20.04
1914 env :
@@ -27,13 +22,13 @@ jobs:
2722 run : |
2823 set -x
2924 mkdir ~/.ssh
30- echo "${${{ matrix.cloud }}_SSH_KEY}" > ~/.ssh/id_rsa
25+ echo "${arcus_SSH_KEY}" > ~/.ssh/id_rsa
3126 chmod 0600 ~/.ssh/id_rsa
3227 env :
3328 arcus_SSH_KEY : ${{ secrets.ARCUS_SSH_KEY }}
3429
3530 - name : Add bastion's ssh key to known_hosts
36- run : cat environments/${{ matrix.cloud }}/bastion_fingerprint >> ~/.ssh/known_hosts
31+ run : cat environments/.stackhpc/bastion_fingerprint >> ~/.ssh/known_hosts
3732 shell : bash
3833
3934 - name : Install ansible etc
@@ -44,38 +39,38 @@ jobs:
4439
4540 - name : Initialise terraform
4641 run : terraform init
47- working-directory : ${{ github.workspace }}/environments/${{ matrix.cloud }}/terraform
42+ working-directory : ${{ github.workspace }}/environments/.stackhpc/terraform
4843
4944 - name : Write clouds.yaml
5045 run : |
5146 mkdir -p ~/.config/openstack/
52- echo "${${{ matrix.cloud }}_CLOUDS_YAML}" > ~/.config/openstack/clouds.yaml
47+ echo "${arcus_CLOUDS_YAML}" > ~/.config/openstack/clouds.yaml
5348 shell : bash
5449 env :
5550 arcus_CLOUDS_YAML : ${{ secrets.ARCUS_CLOUDS_YAML }}
5651
5752 - name : Setup environment-specific inventory/terraform inputs
5853 run : |
5954 . venv/bin/activate
60- . environments/${{ matrix.cloud }}/activate
55+ . environments/.stackhpc/activate
6156 ansible-playbook ansible/adhoc/generate-passwords.yml
6257 echo vault_testuser_password: "$TESTUSER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/all/test_user.yml
6358 env :
6459 TESTUSER_PASSWORD : ${{ secrets.TEST_USER_PASSWORD }}
6560
66- - name : Provision servers
61+ - name : Provision nodes using fat image
6762 id : provision_servers
6863 run : |
6964 . venv/bin/activate
70- . environments/${{ matrix.cloud }}/activate
65+ . environments/.stackhpc/activate
7166 cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
7267 terraform apply -auto-approve
7368
7469 - name : Get server provisioning failure messages
7570 id : provision_failure
7671 run : |
7772 . venv/bin/activate
78- . environments/${{ matrix.cloud }}/activate
73+ . environments/.stackhpc/activate
7974 cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
8075 TF_FAIL_MSGS="$(../../skeleton/\{\{cookiecutter.environment\}\}/terraform/getfaults.py $PWD)"
8176 echo TF failure messages: $TF_FAIL_MSGS
@@ -85,29 +80,29 @@ jobs:
8580 - name : Delete infrastructure if failed due to lack of hosts
8681 run : |
8782 . venv/bin/activate
88- . environments/${{ matrix.cloud }}/activate
83+ . environments/.stackhpc/activate
8984 cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
9085 terraform destroy -auto-approve
9186 if : ${{ always() && steps.provision_servers.outcome == 'failure' && contains(steps.provision_failure.outputs.messages, 'not enough hosts available') }}
9287
93- - name : Directly configure cluster
88+ - name : Configure cluster
9489 run : |
9590 . venv/bin/activate
96- . environments/${{ matrix.cloud }}/activate
91+ . environments/.stackhpc/activate
9792 ansible all -m wait_for_connection
9893 ansible-playbook -v ansible/site.yml
9994 ansible-playbook -v ansible/ci/check_slurm.yml
10095
10196 - name : Run MPI-based tests
10297 run : |
10398 . venv/bin/activate
104- . environments/${{ matrix.cloud }}/activate
99+ . environments/.stackhpc/activate
105100 ansible-playbook -vv ansible/adhoc/hpctests.yml
106101
107102 - name : Confirm Open Ondemand is up (via SOCKS proxy)
108103 run : |
109104 . venv/bin/activate
110- . environments/${{ matrix.cloud }}/activate
105+ . environments/.stackhpc/activate
111106
112107 # load ansible variables into shell:
113108 ansible-playbook ansible/ci/output_vars.yml \
@@ -135,63 +130,55 @@ jobs:
135130 env :
136131 TESTUSER_PASSWORD : ${{ secrets.TEST_USER_PASSWORD }}
137132
138- - name : Build packer images
139- id : packer_build
140- run : |
141- . venv/bin/activate
142- . environments/${{ matrix.cloud }}/activate
143- cd packer/
144- PACKER_LOG=1 packer build -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl
145- ../dev/output_manifest.py packer-manifest.json # Sets NEW_{COMPUTE,CONTROL,LOGIN}_IMAGE_ID outputs
146-
147- - name : Test reimage of login nodes (via rebuild adhoc)
148- run : |
149- . venv/bin/activate
150- . environments/${{ matrix.cloud }}/activate
151- ansible-playbook -v --limit login ansible/adhoc/rebuild.yml -e rebuild_image=${{ steps.packer_build.outputs.NEW_LOGIN_IMAGE_ID }}
152- ansible login -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
153- ansible-playbook -v ansible/ci/check_slurm.yml
154-
155- - name : Test reimage of compute nodes (via slurm)
156- run : |
157- . venv/bin/activate
158- . environments/${{ matrix.cloud }}/activate
159- ansible login -v -a "sudo scontrol reboot ASAP nextstate=RESUME reason='rebuild image:${{ steps.packer_build.outputs.NEW_COMPUTE_IMAGE_ID }}' ${TF_VAR_cluster_name}-compute-[0-3]"
160- ansible compute -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
161- ansible-playbook -v ansible/ci/check_slurm.yml
133+ # - name: Build environment-specific compute image
134+ # id: packer_build
135+ # run: |
136+ # . venv/bin/activate
137+ # . environments/.stackhpc/activate
138+ # cd packer/
139+ # packer init
140+ # PACKER_LOG=1 packer build -except openstack.fatimage -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl
141+ # ../dev/output_manifest.py packer-manifest.json # Sets NEW_COMPUTE_IMAGE_ID outputs
142+
143+ # - name: Test reimage of compute nodes to new environment-specific image (via slurm)
144+ # run: |
145+ # . venv/bin/activate
146+ # . environments/.stackhpc/activate
147+ # ansible login -v -a "sudo scontrol reboot ASAP nextstate=RESUME reason='rebuild image:${{ steps.packer_build.outputs.NEW_COMPUTE_IMAGE_ID }}' ${TF_VAR_cluster_name}-compute-[0-3]"
148+ # ansible compute -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
149+ # ansible-playbook -v ansible/ci/check_slurm.yml
162150
163- - name : Test reimage of control node (via rebuild adhoc)
151+ - name : Test reimage of all nodes (via rebuild adhoc)
164152 run : |
165153 . venv/bin/activate
166- . environments/${{ matrix.cloud }}/activate
167- ansible-playbook -v --limit control ansible/adhoc/rebuild.yml -e rebuild_image=${{ steps.packer_build.outputs.NEW_CONTROL_IMAGE_ID }}
168- ansible control -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
169- ansible-playbook ansible/slurm.yml --tags openhpc # configures partitions
170- ansible-playbook ansible/monitoring.yml --tags prometheus # configures scrapes
154+ . environments/.stackhpc/activate
155+ ansible-playbook -v --limit control,login ansible/adhoc/rebuild.yml
156+ ansible all -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
157+ ansible-playbook -v ansible/site.yml
171158 ansible-playbook -v ansible/ci/check_slurm.yml
172159
173160 - name : Check sacct state survived reimage
174161 run : |
175162 . venv/bin/activate
176- . environments/${{ matrix.cloud }}/activate
163+ . environments/.stackhpc/activate
177164 ansible-playbook -vv ansible/ci/check_sacct_hpctests.yml
178165
179166 - name : Check MPI-based tests are shown in Grafana
180167 run : |
181168 . venv/bin/activate
182- . environments/${{ matrix.cloud }}/activate
169+ . environments/.stackhpc/activate
183170 ansible-playbook -vv ansible/ci/check_grafana.yml
184171
185172 - name : Delete infrastructure
186173 run : |
187174 . venv/bin/activate
188- . environments/${{ matrix.cloud }}/activate
175+ . environments/.stackhpc/activate
189176 cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
190177 terraform destroy -auto-approve
191178 if : ${{ success() || cancelled() }}
192179
193- - name : Delete images
194- run : |
195- . venv/bin/activate
196- . environments/${{ matrix.cloud }}/activate
197- ansible-playbook -vv ansible/ci/delete_images.yml
180+ # - name: Delete images
181+ # run: |
182+ # . venv/bin/activate
183+ # . environments/.stackhpc/activate
184+ # ansible-playbook -vv ansible/ci/delete_images.yml
0 commit comments