undo latest commits
This commit is contained in:
parent
07739ff434
commit
8f53c057af
55
README.md
55
README.md
@ -2,6 +2,15 @@
|
||||
|
||||
An Ansible role to install the NVIDIA driver from the NVIDIA CUDA repositories.
|
||||
|
||||
## Requirements
|
||||
|
||||
In the process of installing the NVIDIA driver, this role will reboot the nodes where it runs.
|
||||
Because of this, we strongly recommend that you run `ansible-playbook` from a different node than the GPU nodes where you are installing the driver.
|
||||
|
||||
If you attempt to run Ansible on the same node where you are installing the driver, this role will either:
|
||||
|
||||
* Refuse to proceed with an error like `Running reboot with local connection would reboot the control node` (if running with the `local` connection)
|
||||
* Reboot the node you're running on, interrupting the playbook execution! (if running with an `ssh` connection against localhost)
|
||||
|
||||
## Installing
|
||||
|
||||
@ -11,12 +20,58 @@ This role can be installed using [Ansible Galaxy](https://galaxy.ansible.com/nvi
|
||||
$ ansible-galaxy install nvidia.nvidia_driver
|
||||
```
|
||||
|
||||
## Role variables
|
||||
|
||||
|
||||
| Variable | Default value | Description |
|
||||
| -------- | ------------- | ----------- |
|
||||
| `nvidia_driver_package_state` | `"present"` | Package state for NVIDIA driver packages |
|
||||
| `nvidia_driver_package_version` | `""` | Package version to install. Note that this should match the actual version of the deb or RPM package to be installed. |
|
||||
| `nvidia_driver_persistence_mode_on` | `yes` | Whether to enable persistence mode (boolean) |
|
||||
| `nvidia_driver_skip_reboot` | `no` | Whether to skip rebooting the node during the install |
|
||||
| `nvidia_driver_module_file` | `"/etc/modprobe.d/nvidia.conf"` | Filename to use for NVIDIA driver parameters |
|
||||
| `nvidia_driver_module_params` | `""` | Parameters to pass to the NVIDIA driver |
|
||||
|
||||
### Red Hat specific variables
|
||||
|
||||
|
||||
| Variable | Default value | Description |
|
||||
| -------- | ------------- | ----------- |
|
||||
| `epel_package` | `"https://dl.fedoraproject.org/pub/epel/epel-release-latest-{{ ansible_distribution_major_version }}.noarch.rpm"` | Package to install to enable EPEL |
|
||||
| `nvidia_driver_rhel_cuda_repo_baseurl` | `"https://developer.download.nvidia.com/compute/cuda/repos/{{ _rhel_repo_dir }}/"` | Base URL to use for CUDA repo |
|
||||
| `nvidia_driver_rhel_cuda_repo_gpgkey` | `"https://developer.download.nvidia.com/compute/cuda/repos/{{ _rhel_repo_dir }}/7fa2af80.pub"` | GPG key for the CUDA repo |
|
||||
|
||||
### Ubuntu specific variables
|
||||
|
||||
For Ubuntu installs, you have the choice of installing from the Canonical repositories or the NVIDIA CUDA repositories.
|
||||
|
||||
By default, the Canonical repositories will be used, and the driver installed will be the headless server driver.
|
||||
|
||||
| Variable | Default value | Description |
|
||||
| -------- | ------------- | ----------- |
|
||||
| `nvidia_driver_ubuntu_install_from_cuda_repo` | `no` | Flag whether to use the CUDA repo |
|
||||
| `nvidia_driver_ubuntu_branch` | `450` | Driver branch to use for the install |
|
||||
| `nvidia_driver_ubuntu_packages` | `["nvidia-headless-450-server", "nvidia-utils-450-server"]` | Package names to install from Canonical repo |
|
||||
| `nvidia_driver_ubuntu_cuda_repo_baseurl` | `"http://developer.download.nvidia.com/compute/cuda/repos/{{ _ubuntu_repo_dir }}"` | Base URL to use for CUDA repo |
|
||||
| `nvidia_driver_ubuntu_cuda_repo_gpgkey_url` | `"https://developer.download.nvidia.com/compute/cuda/repos/{{ _ubuntu_repo_dir }}/7fa2af80.pub"` | GPG key for the CUDA repo |
|
||||
| `nvidia_driver_ubuntu_cuda_repo_gpgkey_id` | `"7fa2af80"` | GPG key ID for the CUDA repo |
|
||||
| `nvidia_driver_ubuntu_cuda_package` | `"cuda-drivers"` | Package name to install from CUDA repo |
|
||||
|
||||
## Example playbook
|
||||
|
||||
```
|
||||
- hosts: gpu_nodes
|
||||
roles:
|
||||
- nvidia.nvidia_driver
|
||||
```
|
||||
|
||||
## Supported distributions
|
||||
|
||||
Currently, this role supports the following Linux distributions:
|
||||
|
||||
* NVIDIA DGX OS 4
|
||||
* NVIDIA DGX OS 5
|
||||
* Ubuntu 18.04 LTS
|
||||
* Ubuntu 20.04 LTS
|
||||
* CentOS 7
|
||||
* Red Hat Enterprise Linux 7
|
||||
|
@ -5,13 +5,33 @@ nvidia_driver_skip_reboot: no
|
||||
nvidia_driver_module_file: /etc/modprobe.d/nvidia.conf
|
||||
nvidia_driver_module_params: ''
|
||||
|
||||
# RedHat family
|
||||
nvidia_driver_rhel_epel_repo_baseurl: "https://download.fedoraproject.org/pub/epel/$releasever/$basearch/"
|
||||
nvidia_driver_rhel_epel_repo_gpgkey: "https://epel.mirror.constant.com//RPM-GPG-KEY-EPEL-{{ ansible_distribution_major_version }}"
|
||||
|
||||
##############################################################################
|
||||
# RedHat family #
|
||||
##############################################################################
|
||||
epel_package: "https://dl.fedoraproject.org/pub/epel/epel-release-latest-{{ ansible_distribution_major_version }}.noarch.rpm"
|
||||
epel_repo_key: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-{{ ansible_distribution_major_version }}"
|
||||
nvidia_driver_rhel_cuda_repo_baseurl: "https://developer.download.nvidia.com/compute/cuda/repos/{{ _rhel_repo_dir }}/"
|
||||
nvidia_driver_rhel_cuda_repo_gpgkey: "https://developer.download.nvidia.com/compute/cuda/repos/{{ _rhel_repo_dir }}/7fa2af80.pub"
|
||||
|
||||
# Ubuntu
|
||||
|
||||
##############################################################################
|
||||
# Ubuntu #
|
||||
##############################################################################
|
||||
|
||||
# Determine if we should install from CUDA repo instead of Canonical repos
|
||||
nvidia_driver_ubuntu_install_from_cuda_repo: no
|
||||
|
||||
# Installing with Canonical repositories
|
||||
nvidia_driver_ubuntu_branch: "450"
|
||||
nvidia_driver_ubuntu_packages:
|
||||
- "nvidia-headless-{{ nvidia_driver_ubuntu_branch }}-server"
|
||||
- "nvidia-utils-{{ nvidia_driver_ubuntu_branch }}-server"
|
||||
- "nvidia-headless-no-dkms-{{ nvidia_driver_ubuntu_branch }}-server"
|
||||
- "nvidia-kernel-source-{{ nvidia_driver_ubuntu_branch }}-server"
|
||||
|
||||
# Installing with CUDA repositories
|
||||
nvidia_driver_ubuntu_cuda_repo_gpgkey_url: "https://developer.download.nvidia.com/compute/cuda/repos/{{ _ubuntu_repo_dir }}/7fa2af80.pub"
|
||||
nvidia_driver_ubuntu_cuda_repo_gpgkey_id: "7fa2af80"
|
||||
nvidia_driver_ubuntu_cuda_repo_baseurl: "http://developer.download.nvidia.com/compute/cuda/repos/{{ _ubuntu_repo_dir }}"
|
||||
nvidia_driver_ubuntu_cuda_repo_baseurl: "https://developer.download.nvidia.com/compute/cuda/repos/{{ _ubuntu_repo_dir }}"
|
||||
nvidia_driver_ubuntu_cuda_package: "cuda-drivers"
|
||||
|
@ -11,6 +11,7 @@ galaxy_info:
|
||||
versions:
|
||||
- 'xenial'
|
||||
- 'bionic'
|
||||
- 'focal'
|
||||
- name: EL
|
||||
versions:
|
||||
- '7'
|
||||
|
@ -1,10 +1,44 @@
|
||||
---
|
||||
# We have to do this because the CentOS mirrors don't keep kernel-headers, etc
|
||||
# for older kernels.
|
||||
- name: ensure we have kernel-headers installed for the current kernel
|
||||
block:
|
||||
- name: attempt to install kernel support packages for current version
|
||||
yum:
|
||||
name:
|
||||
- "kernel-headers-{{ ansible_kernel }}"
|
||||
- "kernel-tools-{{ ansible_kernel }}"
|
||||
- "kernel-tools-libs-{{ ansible_kernel }}"
|
||||
- "kernel-devel-{{ ansible_kernel }}"
|
||||
- "kernel-debug-devel-{{ ansible_kernel }}"
|
||||
state: present
|
||||
environment: "{{proxy_env if proxy_env is defined else {}}}"
|
||||
rescue:
|
||||
- name: update the kernel to latest version so we have a supported version
|
||||
yum:
|
||||
name:
|
||||
- "kernel"
|
||||
- "kernel-headers"
|
||||
- "kernel-tools"
|
||||
- "kernel-tools-libs"
|
||||
- "kernel-devel"
|
||||
- "kernel-debug-devel"
|
||||
state: latest
|
||||
environment: "{{proxy_env if proxy_env is defined else {}}}"
|
||||
- name: reboot to pick up the new kernel
|
||||
reboot:
|
||||
|
||||
- name: add epel repo gpg key
|
||||
rpm_key:
|
||||
key: "{{ epel_repo_key }}"
|
||||
state: present
|
||||
|
||||
- name: add epel repo
|
||||
yum_repository:
|
||||
name: epel
|
||||
description: EPEL YUM repo
|
||||
baseurl: "{{ nvidia_driver_rhel_epel_repo_baseurl }}"
|
||||
gpgkey: "{{ nvidia_driver_rhel_epel_repo_gpgkey }}"
|
||||
become: true
|
||||
yum:
|
||||
name:
|
||||
- "{{ epel_package }}"
|
||||
state: latest
|
||||
environment: "{{proxy_env if proxy_env is defined else {}}}"
|
||||
|
||||
- name: install dependencies
|
||||
@ -18,47 +52,27 @@
|
||||
gpgkey: "{{ nvidia_driver_rhel_cuda_repo_gpgkey }}"
|
||||
environment: "{{proxy_env if proxy_env is defined else {}}}"
|
||||
|
||||
- name: install driver packages
|
||||
- name: install driver packages RHEL/CentOS 7 and older
|
||||
yum:
|
||||
name: "{{ nvidia_driver_package_version | ternary('nvidia-driver-latest-dkms-'+nvidia_driver_package_version, 'nvidia-driver-latest-dkms') }}"
|
||||
state: "{{ nvidia_driver_package_state }}"
|
||||
autoremove: "{{ nvidia_driver_package_state == 'absent' }}"
|
||||
register: install_driver_rhel7
|
||||
environment: "{{proxy_env if proxy_env is defined else {}}}"
|
||||
when: ansible_distribution_major_version < '8'
|
||||
|
||||
- name: install driver packages RHEL/CentOS 8 and newer
|
||||
dnf:
|
||||
name: "{{ nvidia_driver_package_version | ternary('@nvidia-driver:'+nvidia_driver_package_version, '@nvidia-driver:latest-dkms') }}"
|
||||
state: "{{ nvidia_driver_package_state }}"
|
||||
autoremove: "{{ nvidia_driver_package_state == 'absent' }}"
|
||||
register: install_driver_rhel8
|
||||
environment: "{{proxy_env if proxy_env is defined else {}}}"
|
||||
when: ansible_distribution_major_version > '7'
|
||||
|
||||
- name: Set install_driver.changed var for RHEL 7/8
|
||||
debug:
|
||||
msg: Driver installed for RHEL
|
||||
when: install_driver_rhel7.changed or install_driver_rhel8.changed
|
||||
register: install_driver
|
||||
environment: "{{proxy_env if proxy_env is defined else {}}}"
|
||||
|
||||
# The driver package pulls in the latest kernel-headers package, but not the
|
||||
# latest kernel. Check to see if there is a mismatch.
|
||||
|
||||
- name: check kernel versions
|
||||
yum:
|
||||
list: kernel
|
||||
register: yum_list
|
||||
|
||||
- name: register installed kernel version
|
||||
debug:
|
||||
msg: "{{ yum_list.results | selectattr('yumstate', 'equalto', 'installed') | list }}"
|
||||
register: kernel_version
|
||||
|
||||
- name: check kernel-headers versions
|
||||
yum:
|
||||
list: kernel-headers
|
||||
register: yum_list
|
||||
|
||||
- name: register installed kernel-headers version
|
||||
debug:
|
||||
msg: "{{ yum_list.results | selectattr('yumstate', 'equalto', 'installed') | list }}"
|
||||
register: kernel_headers_version
|
||||
|
||||
- name: update kernel if headers don't match
|
||||
yum:
|
||||
name:
|
||||
- kernel
|
||||
- kernel-tools
|
||||
- kernel-tools-libs
|
||||
- kernel-devel
|
||||
- kernel-debug-devel
|
||||
- kernel-headers
|
||||
state: latest
|
||||
register: kernel_update
|
||||
when: kernel_version.msg[0].release != kernel_headers_version.msg[0].release
|
||||
environment: "{{proxy_env if proxy_env is defined else {}}}"
|
||||
changed_when: install_driver_rhel7.changed or install_driver_rhel8.changed
|
||||
|
35
tasks/install-ubuntu-cuda-repo.yml
Normal file
35
tasks/install-ubuntu-cuda-repo.yml
Normal file
@ -0,0 +1,35 @@
|
||||
---
|
||||
- name: remove ppa
|
||||
apt_repository:
|
||||
repo: ppa:graphics-drivers/ppa
|
||||
state: absent
|
||||
|
||||
- name: add pin file
|
||||
copy:
|
||||
src: "cuda-ubuntu.pin"
|
||||
dest: "/etc/apt/preferences.d/cuda-repository-pin-600"
|
||||
owner: "root"
|
||||
group: "root"
|
||||
mode: "0644"
|
||||
|
||||
- name: add key
|
||||
apt_key:
|
||||
url: "{{ nvidia_driver_ubuntu_cuda_repo_gpgkey_url }}"
|
||||
id: "{{ nvidia_driver_ubuntu_cuda_repo_gpgkey_id }}"
|
||||
environment: "{{proxy_env if proxy_env is defined else {}}}"
|
||||
|
||||
|
||||
- name: add repo
|
||||
apt_repository:
|
||||
repo: "deb {{ nvidia_driver_ubuntu_cuda_repo_baseurl }} /"
|
||||
update_cache: yes
|
||||
environment: "{{proxy_env if proxy_env is defined else {}}}"
|
||||
|
||||
- name: install driver packages
|
||||
apt:
|
||||
name: "{{ nvidia_driver_package_version | ternary(nvidia_driver_ubuntu_cuda_package+'='+nvidia_driver_package_version, nvidia_driver_ubuntu_cuda_package) }}"
|
||||
state: "{{ nvidia_driver_package_state }}"
|
||||
autoremove: "{{ nvidia_driver_package_state == 'absent' }}"
|
||||
purge: "{{ nvidia_driver_package_state == 'absent' }}"
|
||||
register: install_driver
|
||||
environment: "{{proxy_env if proxy_env is defined else {}}}"
|
@ -4,32 +4,12 @@
|
||||
repo: ppa:graphics-drivers/ppa
|
||||
state: absent
|
||||
|
||||
- name: add pin file
|
||||
copy:
|
||||
src: "cuda-ubuntu.pin"
|
||||
dest: "/etc/apt/preferences.d/cuda-repository-pin-600"
|
||||
owner: "root"
|
||||
group: "root"
|
||||
mode: "0644"
|
||||
|
||||
- name: add key
|
||||
apt_key:
|
||||
url: "{{ nvidia_driver_ubuntu_cuda_repo_gpgkey_url }}"
|
||||
id: "{{ nvidia_driver_ubuntu_cuda_repo_gpgkey_id }}"
|
||||
environment: "{{proxy_env if proxy_env is defined else {}}}"
|
||||
|
||||
|
||||
- name: add repo
|
||||
apt_repository:
|
||||
repo: "deb {{ nvidia_driver_ubuntu_cuda_repo_baseurl }} /"
|
||||
update_cache: yes
|
||||
environment: "{{proxy_env if proxy_env is defined else {}}}"
|
||||
|
||||
- name: install driver packages
|
||||
apt:
|
||||
name: "{{ nvidia_driver_package_version | ternary('cuda-drivers='+nvidia_driver_package_version, 'cuda-drivers') }}"
|
||||
name: "{{ nvidia_driver_package_version | ternary(item+'='+nvidia_driver_package_version, item) }}"
|
||||
state: "{{ nvidia_driver_package_state }}"
|
||||
autoremove: "{{ nvidia_driver_package_state == 'absent' }}"
|
||||
purge: "{{ nvidia_driver_package_state == 'absent' }}"
|
||||
with_items: "{{ nvidia_driver_ubuntu_packages }}"
|
||||
register: install_driver
|
||||
environment: "{{proxy_env if proxy_env is defined else {}}}"
|
||||
|
@ -5,9 +5,18 @@
|
||||
state: absent
|
||||
ignore_errors: true
|
||||
|
||||
- name: ubuntu install tasks
|
||||
- name: Blacklist the nouveau driver module
|
||||
community.general.kernel_blacklist:
|
||||
name: nouveau
|
||||
state: present
|
||||
|
||||
- name: ubuntu install tasks (canonical repos)
|
||||
include_tasks: install-ubuntu.yml
|
||||
when: ansible_distribution == 'Ubuntu'
|
||||
when: ansible_distribution == 'Ubuntu' and (not nvidia_driver_ubuntu_install_from_cuda_repo)
|
||||
|
||||
- name: ubuntu install tasks (CUDA repo)
|
||||
include_tasks: install-ubuntu-cuda-repo.yml
|
||||
when: ansible_distribution == 'Ubuntu' and nvidia_driver_ubuntu_install_from_cuda_repo
|
||||
|
||||
- name: redhat family install tasks
|
||||
include_tasks: install-redhat.yml
|
||||
|
Loading…
Reference in New Issue
Block a user