undo latest commits

This commit is contained in:
Pablo Escobar 2021-10-26 19:09:17 +02:00
parent 07739ff434
commit 8f53c057af
7 changed files with 187 additions and 73 deletions

View File

@ -2,6 +2,15 @@
An Ansible role to install the NVIDIA driver from the NVIDIA CUDA repositories.
## Requirements
In the process of installing the NVIDIA driver, this role will reboot the nodes where it runs.
Because of this, we strongly recommend that you run `ansible-playbook` from a different node than the GPU nodes where you are installing the driver.
If you attempt to run Ansible on the same node where you are installing the driver, this role will either:
* Refuse to proceed with an error like `Running reboot with local connection would reboot the control node` (if running with the `local` connection)
* Reboot the node you're running on, interrupting the playbook execution! (if running with an `ssh` connection against localhost)
## Installing
@ -11,12 +20,58 @@ This role can be installed using [Ansible Galaxy](https://galaxy.ansible.com/nvi
$ ansible-galaxy install nvidia.nvidia_driver
```
## Role variables
| Variable | Default value | Description |
| -------- | ------------- | ----------- |
| `nvidia_driver_package_state` | `"present"` | Package state for NVIDIA driver packages |
| `nvidia_driver_package_version` | `""` | Package version to install. Note that this should match the actual version of the deb or RPM package to be installed. |
| `nvidia_driver_persistence_mode_on` | `yes` | Whether to enable persistence mode (boolean) |
| `nvidia_driver_skip_reboot` | `no` | Whether to skip rebooting the node during the install |
| `nvidia_driver_module_file` | `"/etc/modprobe.d/nvidia.conf"` | Filename to use for NVIDIA driver parameters |
| `nvidia_driver_module_params` | `""` | Parameters to pass to the NVIDIA driver |
### Red Hat specific variables
| Variable | Default value | Description |
| -------- | ------------- | ----------- |
| `epel_package` | `"https://dl.fedoraproject.org/pub/epel/epel-release-latest-{{ ansible_distribution_major_version }}.noarch.rpm"` | Package to install to enable EPEL |
| `nvidia_driver_rhel_cuda_repo_baseurl` | `"https://developer.download.nvidia.com/compute/cuda/repos/{{ _rhel_repo_dir }}/"` | Base URL to use for CUDA repo |
| `nvidia_driver_rhel_cuda_repo_gpgkey` | `"https://developer.download.nvidia.com/compute/cuda/repos/{{ _rhel_repo_dir }}/7fa2af80.pub"` | GPG key for the CUDA repo |
### Ubuntu specific variables
For Ubuntu installs, you have the choice of installing from the Canonical repositories or the NVIDIA CUDA repositories.
By default, the Canonical repositories will be used, and the driver installed will be the headless server driver.
| Variable | Default value | Description |
| -------- | ------------- | ----------- |
| `nvidia_driver_ubuntu_install_from_cuda_repo` | `no` | Flag whether to use the CUDA repo |
| `nvidia_driver_ubuntu_branch` | `450` | Driver branch to use for the install |
| `nvidia_driver_ubuntu_packages` | `["nvidia-headless-450-server", "nvidia-utils-450-server"]` | Package names to install from Canonical repo |
| `nvidia_driver_ubuntu_cuda_repo_baseurl` | `"http://developer.download.nvidia.com/compute/cuda/repos/{{ _ubuntu_repo_dir }}"` | Base URL to use for CUDA repo |
| `nvidia_driver_ubuntu_cuda_repo_gpgkey_url` | `"https://developer.download.nvidia.com/compute/cuda/repos/{{ _ubuntu_repo_dir }}/7fa2af80.pub"` | GPG key for the CUDA repo |
| `nvidia_driver_ubuntu_cuda_repo_gpgkey_id` | `"7fa2af80"` | GPG key ID for the CUDA repo |
| `nvidia_driver_ubuntu_cuda_package` | `"cuda-drivers"` | Package name to install from CUDA repo |
## Example playbook
```
- hosts: gpu_nodes
roles:
- nvidia.nvidia_driver
```
## Supported distributions
Currently, this role supports the following Linux distributions:
* NVIDIA DGX OS 4
* NVIDIA DGX OS 5
* Ubuntu 18.04 LTS
* Ubuntu 20.04 LTS
* CentOS 7
* Red Hat Enterprise Linux 7

View File

@ -5,13 +5,33 @@ nvidia_driver_skip_reboot: no
nvidia_driver_module_file: /etc/modprobe.d/nvidia.conf
nvidia_driver_module_params: ''
# RedHat family
nvidia_driver_rhel_epel_repo_baseurl: "https://download.fedoraproject.org/pub/epel/$releasever/$basearch/"
nvidia_driver_rhel_epel_repo_gpgkey: "https://epel.mirror.constant.com//RPM-GPG-KEY-EPEL-{{ ansible_distribution_major_version }}"
##############################################################################
# RedHat family #
##############################################################################
epel_package: "https://dl.fedoraproject.org/pub/epel/epel-release-latest-{{ ansible_distribution_major_version }}.noarch.rpm"
epel_repo_key: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-{{ ansible_distribution_major_version }}"
nvidia_driver_rhel_cuda_repo_baseurl: "https://developer.download.nvidia.com/compute/cuda/repos/{{ _rhel_repo_dir }}/"
nvidia_driver_rhel_cuda_repo_gpgkey: "https://developer.download.nvidia.com/compute/cuda/repos/{{ _rhel_repo_dir }}/7fa2af80.pub"
# Ubuntu
##############################################################################
# Ubuntu #
##############################################################################
# Determine if we should install from CUDA repo instead of Canonical repos
nvidia_driver_ubuntu_install_from_cuda_repo: no
# Installing with Canonical repositories
nvidia_driver_ubuntu_branch: "450"
nvidia_driver_ubuntu_packages:
- "nvidia-headless-{{ nvidia_driver_ubuntu_branch }}-server"
- "nvidia-utils-{{ nvidia_driver_ubuntu_branch }}-server"
- "nvidia-headless-no-dkms-{{ nvidia_driver_ubuntu_branch }}-server"
- "nvidia-kernel-source-{{ nvidia_driver_ubuntu_branch }}-server"
# Installing with CUDA repositories
nvidia_driver_ubuntu_cuda_repo_gpgkey_url: "https://developer.download.nvidia.com/compute/cuda/repos/{{ _ubuntu_repo_dir }}/7fa2af80.pub"
nvidia_driver_ubuntu_cuda_repo_gpgkey_id: "7fa2af80"
nvidia_driver_ubuntu_cuda_repo_baseurl: "http://developer.download.nvidia.com/compute/cuda/repos/{{ _ubuntu_repo_dir }}"
nvidia_driver_ubuntu_cuda_repo_baseurl: "https://developer.download.nvidia.com/compute/cuda/repos/{{ _ubuntu_repo_dir }}"
nvidia_driver_ubuntu_cuda_package: "cuda-drivers"

View File

@ -11,6 +11,7 @@ galaxy_info:
versions:
- 'xenial'
- 'bionic'
- 'focal'
- name: EL
versions:
- '7'

View File

@ -1,10 +1,44 @@
---
# We have to do this because the CentOS mirrors don't keep kernel-headers, etc
# for older kernels.
#
# Strategy: first try to install the kernel support packages pinned to the
# version reported by `ansible_kernel`. If that attempt fails, the rescue
# tasks install the latest kernel together with its matching support packages.
- name: ensure we have kernel-headers installed for the current kernel
block:
# Every package name is suffixed with `ansible_kernel` so that the headers,
# tools and devel packages requested are the ones for that exact version.
- name: attempt to install kernel support packages for current version
yum:
name:
- "kernel-headers-{{ ansible_kernel }}"
- "kernel-tools-{{ ansible_kernel }}"
- "kernel-tools-libs-{{ ansible_kernel }}"
- "kernel-devel-{{ ansible_kernel }}"
- "kernel-debug-devel-{{ ansible_kernel }}"
state: present
# Use `proxy_env` as the task environment when it is defined, otherwise an
# empty environment.
environment: "{{proxy_env if proxy_env is defined else {}}}"
rescue:
# Fallback path: unversioned package names with `state: latest`, so the
# kernel and its support packages are all brought to the newest version.
- name: update the kernel to latest version so we have a supported version
yum:
name:
- "kernel"
- "kernel-headers"
- "kernel-tools"
- "kernel-tools-libs"
- "kernel-devel"
- "kernel-debug-devel"
state: latest
environment: "{{proxy_env if proxy_env is defined else {}}}"
# NOTE(review): indentation is not visible in this view; presumably this
# reboot is part of the rescue section so that it only runs after the kernel
# update above - confirm against the real file.
- name: reboot to pick up the new kernel
reboot:
- name: add epel repo gpg key
rpm_key:
key: "{{ epel_repo_key }}"
state: present
- name: add epel repo
yum_repository:
name: epel
description: EPEL YUM repo
baseurl: "{{ nvidia_driver_rhel_epel_repo_baseurl }}"
gpgkey: "{{ nvidia_driver_rhel_epel_repo_gpgkey }}"
become: true
yum:
name:
- "{{ epel_package }}"
state: latest
environment: "{{proxy_env if proxy_env is defined else {}}}"
- name: install dependencies
@ -18,47 +52,27 @@
gpgkey: "{{ nvidia_driver_rhel_cuda_repo_gpgkey }}"
environment: "{{proxy_env if proxy_env is defined else {}}}"
- name: install driver packages
- name: install driver packages RHEL/CentOS 7 and older
yum:
name: "{{ nvidia_driver_package_version | ternary('nvidia-driver-latest-dkms-'+nvidia_driver_package_version, 'nvidia-driver-latest-dkms') }}"
state: "{{ nvidia_driver_package_state }}"
autoremove: "{{ nvidia_driver_package_state == 'absent' }}"
register: install_driver_rhel7
environment: "{{proxy_env if proxy_env is defined else {}}}"
when: ansible_distribution_major_version < '8'
- name: install driver packages RHEL/CentOS 8 and newer
dnf:
name: "{{ nvidia_driver_package_version | ternary('@nvidia-driver:'+nvidia_driver_package_version, '@nvidia-driver:latest-dkms') }}"
state: "{{ nvidia_driver_package_state }}"
autoremove: "{{ nvidia_driver_package_state == 'absent' }}"
register: install_driver_rhel8
environment: "{{proxy_env if proxy_env is defined else {}}}"
when: ansible_distribution_major_version > '7'
- name: Set install_driver.changed var for RHEL 7/8
debug:
msg: Driver installed for RHEL
when: install_driver_rhel7.changed or install_driver_rhel8.changed
register: install_driver
environment: "{{proxy_env if proxy_env is defined else {}}}"
# The driver package pulls in the latest kernel-headers package, but not the
# latest kernel. Check to see if there is a mismatch.
- name: check kernel versions
yum:
list: kernel
register: yum_list
- name: register installed kernel version
debug:
msg: "{{ yum_list.results | selectattr('yumstate', 'equalto', 'installed') | list }}"
register: kernel_version
- name: check kernel-headers versions
yum:
list: kernel-headers
register: yum_list
- name: register installed kernel-headers version
debug:
msg: "{{ yum_list.results | selectattr('yumstate', 'equalto', 'installed') | list }}"
register: kernel_headers_version
- name: update kernel if headers don't match
yum:
name:
- kernel
- kernel-tools
- kernel-tools-libs
- kernel-devel
- kernel-debug-devel
- kernel-headers
state: latest
register: kernel_update
when: kernel_version.msg[0].release != kernel_headers_version.msg[0].release
environment: "{{proxy_env if proxy_env is defined else {}}}"
changed_when: install_driver_rhel7.changed or install_driver_rhel8.changed

View File

@ -0,0 +1,35 @@
---
# Tasks that configure an apt repository for the NVIDIA CUDA packages and
# install the driver package from it.
#
# Tasks that download something run with `proxy_env` as their environment
# when that variable is defined, and with an empty environment otherwise.

# Make sure the graphics-drivers PPA is not configured.
- name: remove ppa
  apt_repository:
    state: absent
    repo: 'ppa:graphics-drivers/ppa'

# Ship the role's apt pin file into the apt preferences directory.
- name: add pin file
  copy:
    src: cuda-ubuntu.pin
    dest: /etc/apt/preferences.d/cuda-repository-pin-600
    owner: root
    group: root
    mode: '0644'

# Import the repository signing key, identified by both URL and key id.
- name: add key
  environment: "{{proxy_env if proxy_env is defined else {}}}"
  apt_key:
    id: "{{ nvidia_driver_ubuntu_cuda_repo_gpgkey_id }}"
    url: "{{ nvidia_driver_ubuntu_cuda_repo_gpgkey_url }}"

# Register the repository and refresh the apt cache in the same step.
- name: add repo
  environment: "{{proxy_env if proxy_env is defined else {}}}"
  apt_repository:
    repo: "deb {{ nvidia_driver_ubuntu_cuda_repo_baseurl }} /"
    update_cache: true

# When `nvidia_driver_package_version` is non-empty the package is requested
# as `<package>=<version>`; otherwise the bare package name is used.
# `autoremove` and `purge` are only enabled when the requested state is
# 'absent'. The module result is registered as `install_driver`.
- name: install driver packages
  environment: "{{proxy_env if proxy_env is defined else {}}}"
  register: install_driver
  apt:
    name: "{{ nvidia_driver_package_version | ternary(nvidia_driver_ubuntu_cuda_package+'='+nvidia_driver_package_version, nvidia_driver_ubuntu_cuda_package) }}"
    state: "{{ nvidia_driver_package_state }}"
    purge: "{{ nvidia_driver_package_state == 'absent' }}"
    autoremove: "{{ nvidia_driver_package_state == 'absent' }}"

View File

@ -4,32 +4,12 @@
repo: ppa:graphics-drivers/ppa
state: absent
- name: add pin file
copy:
src: "cuda-ubuntu.pin"
dest: "/etc/apt/preferences.d/cuda-repository-pin-600"
owner: "root"
group: "root"
mode: "0644"
- name: add key
apt_key:
url: "{{ nvidia_driver_ubuntu_cuda_repo_gpgkey_url }}"
id: "{{ nvidia_driver_ubuntu_cuda_repo_gpgkey_id }}"
environment: "{{proxy_env if proxy_env is defined else {}}}"
- name: add repo
apt_repository:
repo: "deb {{ nvidia_driver_ubuntu_cuda_repo_baseurl }} /"
update_cache: yes
environment: "{{proxy_env if proxy_env is defined else {}}}"
- name: install driver packages
apt:
name: "{{ nvidia_driver_package_version | ternary('cuda-drivers='+nvidia_driver_package_version, 'cuda-drivers') }}"
name: "{{ nvidia_driver_package_version | ternary(item+'='+nvidia_driver_package_version, item) }}"
state: "{{ nvidia_driver_package_state }}"
autoremove: "{{ nvidia_driver_package_state == 'absent' }}"
purge: "{{ nvidia_driver_package_state == 'absent' }}"
with_items: "{{ nvidia_driver_ubuntu_packages }}"
register: install_driver
environment: "{{proxy_env if proxy_env is defined else {}}}"

View File

@ -5,9 +5,18 @@
state: absent
ignore_errors: true
- name: ubuntu install tasks
- name: Blacklist the nouveau driver module
community.general.kernel_blacklist:
name: nouveau
state: present
- name: ubuntu install tasks (canonical repos)
include_tasks: install-ubuntu.yml
when: ansible_distribution == 'Ubuntu'
when: ansible_distribution == 'Ubuntu' and (not nvidia_driver_ubuntu_install_from_cuda_repo)
- name: ubuntu install tasks (CUDA repo)
include_tasks: install-ubuntu-cuda-repo.yml
when: ansible_distribution == 'Ubuntu' and nvidia_driver_ubuntu_install_from_cuda_repo
- name: redhat family install tasks
include_tasks: install-redhat.yml