From 8f53c057afc4106879af491e198200d575c9444f Mon Sep 17 00:00:00 2001 From: Pablo Escobar Date: Tue, 26 Oct 2021 19:09:17 +0200 Subject: [PATCH] undo latest commits --- README.md | 55 ++++++++++++++++ defaults/main.yml | 30 +++++++-- meta/main.yml | 1 + tasks/install-redhat.yml | 102 ++++++++++++++++------------- tasks/install-ubuntu-cuda-repo.yml | 35 ++++++++++ tasks/install-ubuntu.yml | 24 +------ tasks/main.yml | 13 +++- 7 files changed, 187 insertions(+), 73 deletions(-) create mode 100644 tasks/install-ubuntu-cuda-repo.yml diff --git a/README.md b/README.md index 6dc977a..6625ede 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,15 @@ An Ansible role to install the NVIDIA driver from the NVIDIA CUDA repositories. +## Requirements + +In the process of installing the NVIDIA driver, this role will reboot the nodes where it runs. +Because of this, we strongly recommend that you run `ansible-playbook` from a different node than the GPU nodes where you are installing the driver. + +If you attempt to run Ansible on the same node where you are installing the driver, this role will either: + +* Refuse to proceed with an error like `Running reboot with local connection would reboot the control node` (if running with the `local` connection) +* Reboot the node you're running on, interrupting the playbook execution! (if running with an `ssh` connection against localhost) ## Installing @@ -11,12 +20,58 @@ This role can be installed using [Ansible Galaxy](https://galaxy.ansible.com/nvi $ ansible-galaxy install nvidia.nvidia_driver ``` +## Role variables + + +| Variable | Default value | Description | +| -------- | ------------- | ----------- | +| `nvidia_driver_package_state` | `"present"` | Package state for NVIDIA driver packages | +| `nvidia_driver_package_version` | `""` | Package version to install. Note that this should match the actual version of the deb or RPM package to be installed. 
| +| `nvidia_driver_persistence_mode_on` | `yes` | Whether to enable persistence mode (boolean) | +| `nvidia_driver_skip_reboot` | `no` | Whether to skip rebooting the node during the install | +| `nvidia_driver_module_file` | `"/etc/modprobe.d/nvidia.conf"` | Filename to use for NVIDIA driver parameters | +| `nvidia_driver_module_params` | `""` | Parameters to pass to the NVIDIA driver | + +### Red Hat specific variables + + +| Variable | Default value | Description | +| -------- | ------------- | ----------- | +| `epel_package` | `"https://dl.fedoraproject.org/pub/epel/epel-release-latest-{{ ansible_distribution_major_version }}.noarch.rpm"` | Package to install to enable EPEL | +| `nvidia_driver_rhel_cuda_repo_baseurl` | `"https://developer.download.nvidia.com/compute/cuda/repos/{{ _rhel_repo_dir }}/"` | Base URL to use for CUDA repo | +| `nvidia_driver_rhel_cuda_repo_gpgkey` | `"https://developer.download.nvidia.com/compute/cuda/repos/{{ _rhel_repo_dir }}/7fa2af80.pub"` | GPG key for the CUDA repo | + +### Ubuntu specific variables + +For Ubuntu installs, you have the choice of installing from the Canonical repositories or the NVIDIA CUDA repositories. + +By default, the Canonical repositories will be used, and the driver installed will be the headless server driver. 
+ +| Variable | Default value | Description | +| -------- | ------------- | ----------- | +| `nvidia_driver_ubuntu_install_from_cuda_repo` | `no` | Whether to install from the CUDA repo instead of the Canonical repos | +| `nvidia_driver_ubuntu_branch` | `"450"` | Driver branch to use for the install | +| `nvidia_driver_ubuntu_packages` | `["nvidia-headless-450-server", "nvidia-utils-450-server", "nvidia-headless-no-dkms-450-server", "nvidia-kernel-source-450-server"]` | Package names to install from Canonical repo | +| `nvidia_driver_ubuntu_cuda_repo_baseurl` | `"https://developer.download.nvidia.com/compute/cuda/repos/{{ _ubuntu_repo_dir }}"` | Base URL to use for CUDA repo | +| `nvidia_driver_ubuntu_cuda_repo_gpgkey_url` | `"https://developer.download.nvidia.com/compute/cuda/repos/{{ _ubuntu_repo_dir }}/7fa2af80.pub"` | GPG key for the CUDA repo | +| `nvidia_driver_ubuntu_cuda_repo_gpgkey_id` | `"7fa2af80"` | GPG key ID for the CUDA repo | +| `nvidia_driver_ubuntu_cuda_package` | `"cuda-drivers"` | Package name to install from CUDA repo | + +## Example playbook + +``` +- hosts: gpu_nodes + roles: + - nvidia.nvidia_driver +``` ## Supported distributions Currently, this role supports the following Linux distributions: * NVIDIA DGX OS 4 +* NVIDIA DGX OS 5 * Ubuntu 18.04 LTS +* Ubuntu 20.04 LTS * CentOS 7 * Red Hat Enterprise Linux 7 diff --git a/defaults/main.yml b/defaults/main.yml index 528d531..8d9d61e 100644 --- a/defaults/main.yml +++ b/defaults/main.yml @@ -5,13 +5,33 @@ nvidia_driver_skip_reboot: no nvidia_driver_module_file: /etc/modprobe.d/nvidia.conf nvidia_driver_module_params: '' -# RedHat family -nvidia_driver_rhel_epel_repo_baseurl: "https://download.fedoraproject.org/pub/epel/$releasever/$basearch/" -nvidia_driver_rhel_epel_repo_gpgkey: "https://epel.mirror.constant.com//RPM-GPG-KEY-EPEL-{{ ansible_distribution_major_version }}" + +############################################################################## +# RedHat family # +############################################################################## +epel_package: 
"https://dl.fedoraproject.org/pub/epel/epel-release-latest-{{ ansible_distribution_major_version }}.noarch.rpm" +epel_repo_key: "https://dl.fedoraproject.org/pub/epel/RPM-GPG-KEY-EPEL-{{ ansible_distribution_major_version }}" nvidia_driver_rhel_cuda_repo_baseurl: "https://developer.download.nvidia.com/compute/cuda/repos/{{ _rhel_repo_dir }}/" nvidia_driver_rhel_cuda_repo_gpgkey: "https://developer.download.nvidia.com/compute/cuda/repos/{{ _rhel_repo_dir }}/7fa2af80.pub" -# Ubuntu + +############################################################################## +# Ubuntu # +############################################################################## + +# Determine if we should install from CUDA repo instead of Canonical repos +nvidia_driver_ubuntu_install_from_cuda_repo: no + +# Installing with Canonical repositories +nvidia_driver_ubuntu_branch: "450" +nvidia_driver_ubuntu_packages: +- "nvidia-headless-{{ nvidia_driver_ubuntu_branch }}-server" +- "nvidia-utils-{{ nvidia_driver_ubuntu_branch }}-server" +- "nvidia-headless-no-dkms-{{ nvidia_driver_ubuntu_branch }}-server" +- "nvidia-kernel-source-{{ nvidia_driver_ubuntu_branch }}-server" + +# Installing with CUDA repositories nvidia_driver_ubuntu_cuda_repo_gpgkey_url: "https://developer.download.nvidia.com/compute/cuda/repos/{{ _ubuntu_repo_dir }}/7fa2af80.pub" nvidia_driver_ubuntu_cuda_repo_gpgkey_id: "7fa2af80" -nvidia_driver_ubuntu_cuda_repo_baseurl: "http://developer.download.nvidia.com/compute/cuda/repos/{{ _ubuntu_repo_dir }}" +nvidia_driver_ubuntu_cuda_repo_baseurl: "https://developer.download.nvidia.com/compute/cuda/repos/{{ _ubuntu_repo_dir }}" +nvidia_driver_ubuntu_cuda_package: "cuda-drivers" diff --git a/meta/main.yml b/meta/main.yml index 3a9017a..da7c9f0 100644 --- a/meta/main.yml +++ b/meta/main.yml @@ -11,6 +11,7 @@ galaxy_info: versions: - 'xenial' - 'bionic' + - 'focal' - name: EL versions: - '7' diff --git a/tasks/install-redhat.yml b/tasks/install-redhat.yml index 65ec4ba..16112d4 100644 --- 
a/tasks/install-redhat.yml +++ b/tasks/install-redhat.yml @@ -1,10 +1,44 @@ --- +# We have to do this because the CentOS mirrors don't keep kernel-headers, etc +# for older kernels. +- name: ensure we have kernel-headers installed for the current kernel + block: + - name: attempt to install kernel support packages for current version + yum: + name: + - "kernel-headers-{{ ansible_kernel }}" + - "kernel-tools-{{ ansible_kernel }}" + - "kernel-tools-libs-{{ ansible_kernel }}" + - "kernel-devel-{{ ansible_kernel }}" + - "kernel-debug-devel-{{ ansible_kernel }}" + state: present + environment: "{{proxy_env if proxy_env is defined else {}}}" + rescue: + - name: update the kernel to latest version so we have a supported version + yum: + name: + - "kernel" + - "kernel-headers" + - "kernel-tools" + - "kernel-tools-libs" + - "kernel-devel" + - "kernel-debug-devel" + state: latest + environment: "{{proxy_env if proxy_env is defined else {}}}" + - name: reboot to pick up the new kernel + reboot: + +- name: add epel repo gpg key + rpm_key: + key: "{{ epel_repo_key }}" + state: present + - name: add epel repo - yum_repository: - name: epel - description: EPEL YUM repo - baseurl: "{{ nvidia_driver_rhel_epel_repo_baseurl }}" - gpgkey: "{{ nvidia_driver_rhel_epel_repo_gpgkey }}" + become: true + yum: + name: + - "{{ epel_package }}" + state: latest environment: "{{proxy_env if proxy_env is defined else {}}}" - name: install dependencies @@ -18,47 +52,27 @@ gpgkey: "{{ nvidia_driver_rhel_cuda_repo_gpgkey }}" environment: "{{proxy_env if proxy_env is defined else {}}}" -- name: install driver packages +- name: install driver packages RHEL/CentOS 7 and older yum: name: "{{ nvidia_driver_package_version | ternary('nvidia-driver-latest-dkms-'+nvidia_driver_package_version, 'nvidia-driver-latest-dkms') }}" state: "{{ nvidia_driver_package_state }}" autoremove: "{{ nvidia_driver_package_state == 'absent' }}" + register: install_driver_rhel7 + environment: "{{proxy_env if proxy_env is 
defined else {}}}" + when: ansible_distribution_major_version | int < 8  # numeric compare: as strings, '10' < '8' is true + +- name: install driver packages RHEL/CentOS 8 and newer + dnf: + name: "{{ nvidia_driver_package_version | ternary('@nvidia-driver:'+nvidia_driver_package_version, '@nvidia-driver:latest-dkms') }}" + state: "{{ nvidia_driver_package_state }}" + autoremove: "{{ nvidia_driver_package_state == 'absent' }}" + register: install_driver_rhel8 + environment: "{{proxy_env if proxy_env is defined else {}}}" + when: ansible_distribution_major_version | int >= 8  # numeric compare: as strings, '10' > '7' is false + +- name: Set install_driver.changed var for RHEL 7/8 + debug: + msg: Driver installed for RHEL + when: install_driver_rhel7.changed or install_driver_rhel8.changed register: install_driver - environment: "{{proxy_env if proxy_env is defined else {}}}" - -# The driver package pulls in the latest kernel-headers package, but not the -# latest kernel. Check to see if there is a mismatch. - -- name: check kernel versions - yum: - list: kernel - register: yum_list - -- name: register installed kernel version - debug: - msg: "{{ yum_list.results | selectattr('yumstate', 'equalto', 'installed') | list }}" - register: kernel_version - -- name: check kernel-headers versions - yum: - list: kernel-headers - register: yum_list - -- name: register installed kernel-headers version - debug: - msg: "{{ yum_list.results | selectattr('yumstate', 'equalto', 'installed') | list }}" - register: kernel_headers_version - -- name: update kernel if headers don't match - yum: - name: - - kernel - - kernel-tools - - kernel-tools-libs - - kernel-devel - - kernel-debug-devel - - kernel-headers - state: latest - register: kernel_update - when: kernel_version.msg[0].release != kernel_headers_version.msg[0].release - environment: "{{proxy_env if proxy_env is defined else {}}}" + changed_when: install_driver_rhel7.changed or install_driver_rhel8.changed diff --git a/tasks/install-ubuntu-cuda-repo.yml b/tasks/install-ubuntu-cuda-repo.yml new file mode 100644 index 0000000..7aab994 --- 
/dev/null +++ b/tasks/install-ubuntu-cuda-repo.yml @@ -0,0 +1,35 @@ +--- +- name: remove ppa + apt_repository: + repo: ppa:graphics-drivers/ppa + state: absent + +- name: add pin file + copy: + src: "cuda-ubuntu.pin" + dest: "/etc/apt/preferences.d/cuda-repository-pin-600" + owner: "root" + group: "root" + mode: "0644" + +- name: add key + apt_key: + url: "{{ nvidia_driver_ubuntu_cuda_repo_gpgkey_url }}" + id: "{{ nvidia_driver_ubuntu_cuda_repo_gpgkey_id }}" + environment: "{{proxy_env if proxy_env is defined else {}}}" + + +- name: add repo + apt_repository: + repo: "deb {{ nvidia_driver_ubuntu_cuda_repo_baseurl }} /" + update_cache: yes + environment: "{{proxy_env if proxy_env is defined else {}}}" + +- name: install driver packages + apt: + name: "{{ nvidia_driver_package_version | ternary(nvidia_driver_ubuntu_cuda_package+'='+nvidia_driver_package_version, nvidia_driver_ubuntu_cuda_package) }}" + state: "{{ nvidia_driver_package_state }}" + autoremove: "{{ nvidia_driver_package_state == 'absent' }}" + purge: "{{ nvidia_driver_package_state == 'absent' }}" + register: install_driver + environment: "{{proxy_env if proxy_env is defined else {}}}" diff --git a/tasks/install-ubuntu.yml b/tasks/install-ubuntu.yml index 7bc40b4..bcd0a39 100644 --- a/tasks/install-ubuntu.yml +++ b/tasks/install-ubuntu.yml @@ -4,32 +4,12 @@ repo: ppa:graphics-drivers/ppa state: absent -- name: add pin file - copy: - src: "cuda-ubuntu.pin" - dest: "/etc/apt/preferences.d/cuda-repository-pin-600" - owner: "root" - group: "root" - mode: "0644" - -- name: add key - apt_key: - url: "{{ nvidia_driver_ubuntu_cuda_repo_gpgkey_url }}" - id: "{{ nvidia_driver_ubuntu_cuda_repo_gpgkey_id }}" - environment: "{{proxy_env if proxy_env is defined else {}}}" - - -- name: add repo - apt_repository: - repo: "deb {{ nvidia_driver_ubuntu_cuda_repo_baseurl }} /" - update_cache: yes - environment: "{{proxy_env if proxy_env is defined else {}}}" - - name: install driver packages apt: - name: "{{ 
nvidia_driver_package_version | ternary('cuda-drivers='+nvidia_driver_package_version, 'cuda-drivers') }}" + name: "{{ nvidia_driver_package_version | ternary(item+'='+nvidia_driver_package_version, item) }}" state: "{{ nvidia_driver_package_state }}" autoremove: "{{ nvidia_driver_package_state == 'absent' }}" purge: "{{ nvidia_driver_package_state == 'absent' }}" + with_items: "{{ nvidia_driver_ubuntu_packages }}" register: install_driver environment: "{{proxy_env if proxy_env is defined else {}}}" diff --git a/tasks/main.yml b/tasks/main.yml index 9ab9c4f..417f8c5 100644 --- a/tasks/main.yml +++ b/tasks/main.yml @@ -5,9 +5,18 @@ state: absent ignore_errors: true -- name: ubuntu install tasks +- name: Blacklist the nouveau driver module + community.general.kernel_blacklist: + name: nouveau + state: present + +- name: ubuntu install tasks (canonical repos) include_tasks: install-ubuntu.yml - when: ansible_distribution == 'Ubuntu' + when: ansible_distribution == 'Ubuntu' and (not nvidia_driver_ubuntu_install_from_cuda_repo | bool)  # | bool: a string "no" from -e must count as false + +- name: ubuntu install tasks (CUDA repo) + include_tasks: install-ubuntu-cuda-repo.yml + when: ansible_distribution == 'Ubuntu' and nvidia_driver_ubuntu_install_from_cuda_repo | bool  # | bool: a string "no" from -e must count as false - name: redhat family install tasks include_tasks: install-redhat.yml