#!/usr/bin/env python3
""" Collects and returns information on available GPUs.

This module provides information for both Nvidia and AMD GPUs. However, the information
available for Nvidia is far more thorough than what is available for AMD, where we need to
plug into plaidML to pull stats. The quality of this data will vary depending on the OS'
particular OpenCL implementation.
"""
import logging
import os
import platform

from lib.utils import get_backend

if platform.system() == 'Darwin':
    import pynvx  # pylint: disable=import-error
    IS_MACOS = True
else:
    import pynvml
    IS_MACOS = False

# Limited PlaidML/AMD Stats
try:
    from lib.plaidml_tools import PlaidMLStats as plaidlib  # pylint:disable=ungrouped-imports
except ImportError:
    plaidlib = None

_EXCLUDE_DEVICES = []


def set_exclude_devices(devices):
    """ Add any explicitly selected GPU devices to the global list of devices to be excluded
    from use by Faceswap.

    Parameters
    ----------
    devices: list
        list of indices corresponding to the GPU devices connected to the computer
    """
    logger = logging.getLogger(__name__)
    logger.debug("Excluding GPU indices: %s", devices)
    if not devices:
        return
    _EXCLUDE_DEVICES.extend(devices)
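
# Illustrative usage sketch (hypothetical, not part of the original module): the
# command line layer is expected to register any user-excluded GPU indices here
# before GPUStats is instantiated, for example:
#
#     set_exclude_devices([1])      # hide the second GPU from Faceswap
#     stats = GPUStats(log=False)   # safe to query before logging is configured
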
class GPUStats():
    """ Holds information and statistics about the GPU(s) available on the currently
    running system.

    Parameters
    ----------
    log: bool, optional
        Whether the class should output information to the logger. There may be occasions where
        the logger has not yet been set up when this class is queried. Attempting to log in
        these instances will raise an error. If GPU stats are being queried prior to the logger
        being available then this parameter should be set to ``False``. Otherwise set to
        ``True``. Default: ``True``
    """
    def __init__(self, log=True):
        # Logger is held internally, as we don't want to log when obtaining system stats on crash
        self._logger = logging.getLogger(__name__) if log else None
        self._log("debug", "Initializing {}".format(self.__class__.__name__))

        self._plaid = None
        self._initialized = False
        self._device_count = 0
        self._active_devices = list()
        self._handles = list()
        self._driver = None
        self._devices = list()
        self._vram = None

        self._initialize(log)
        self._driver = self._get_driver()
        self._devices = self._get_devices()
        self._vram = self._get_vram()

        if not self._active_devices:
            self._log("warning", "No GPU detected. Switching to CPU mode")
            return

        self._shutdown()
        self._log("debug", "Initialized {}".format(self.__class__.__name__))

    @property
    def device_count(self):
        """int: The number of GPU devices discovered on the system. """
        return self._device_count

    @property
    def cli_devices(self):
        """ list: List of available devices for use in faceswap's command line arguments """
        return ["{}: {}".format(idx, device) for idx, device in enumerate(self._devices)]

    @property
    def exclude_all_devices(self):
        """ bool: ``True`` if all GPU devices have been explicitly disabled otherwise ``False`` """
        return all(idx in _EXCLUDE_DEVICES for idx in range(len(self._devices)))

    @property
    def _is_plaidml(self):
        """ bool: ``True`` if the backend is plaidML otherwise ``False``. """
        return self._plaid is not None

    @property
    def sys_info(self):
        """ dict: GPU Stats that are required for system information logging.

        The dictionary contains the following data:

            **vram** (`list`): The total amount of VRAM in Megabytes for each GPU as pertaining
            to :attr:`_handles`

            **driver** (`str`): The GPU driver version that is installed on the OS

            **devices** (`list`): The device name of each GPU on the system as pertaining
            to :attr:`_handles`

            **devices_active** (`list`): The indices of the active GPUs on the system as
            pertaining to :attr:`_handles`
        """
        return dict(vram=self._vram,
                    driver=self._driver,
                    devices=self._devices,
                    devices_active=self._active_devices)
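
    # Illustrative only (hypothetical values, not from the original module): on a
    # single Nvidia GPU system the `sys_info` dictionary above is shaped roughly
    # as follows, with VRAM in Megabytes and `devices_active` holding GPU indices:
    #
    #     {"vram": [8192.0],
    #      "driver": "470.57.02",
    #      "devices": ["GeForce GTX 1080"],
    #      "devices_active": [0]}
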
    def _log(self, level, message):
        """ If the class has been initialized with :attr:`log` as `True` then log the message,
        otherwise skip logging.

        Parameters
        ----------
        level: str
            The log level to log at
        message: str
            The message to log
        """
        if self._logger is None:
            return
        logger = getattr(self._logger, level.lower())
        logger(message)

    def _initialize(self, log=False):
        """ Initialize the library that will be returning stats for the system's GPU(s).

        For Nvidia (on Linux and Windows) the library is `pynvml`. For Nvidia (on macOS) the
        library is `pynvx`. For AMD, `plaidML` is used.

        Parameters
        ----------
        log: bool, optional
            Whether the class should output information to the logger. There may be occasions
            where the logger has not yet been set up when this class is queried. Attempting to
            log in these instances will raise an error. If GPU stats are being queried prior to
            the logger being available then this parameter should be set to ``False``. Otherwise
            set to ``True``. Default: ``False``
        """
        if not self._initialized:
            if get_backend() == "amd":
                self._log("debug", "AMD Detected. Using plaidMLStats")
                loglevel = "INFO" if self._logger is None else self._logger.getEffectiveLevel()
                self._plaid = plaidlib(log_level=loglevel, log=log)
            elif IS_MACOS:
                self._log("debug", "macOS Detected. Using pynvx")
                try:
                    pynvx.cudaInit()
                except RuntimeError:
                    self._initialized = True
                    return
            else:
                try:
                    self._log("debug", "OS is not macOS. Trying pynvml")
                    pynvml.nvmlInit()
                except (pynvml.NVMLError_LibraryNotFound,  # pylint: disable=no-member
                        pynvml.NVMLError_DriverNotLoaded,  # pylint: disable=no-member
                        pynvml.NVMLError_NoPermission) as err:  # pylint: disable=no-member
                    if plaidlib is not None:
                        self._log("debug", "pynvml errored. Trying plaidML")
                        self._plaid = plaidlib(log=log)
                    else:
                        msg = ("There was an error reading from the Nvidia Machine Learning "
                               "Library. Either you do not have an Nvidia GPU (in which case "
                               "this warning can be ignored) or the most likely cause is "
                               "incorrectly installed drivers. If this is the case, please "
                               "remove and reinstall your Nvidia drivers before reporting. "
                               "Original Error: {}".format(str(err)))
                        self._log("warning", msg)
                        self._initialized = True
                        return
                except Exception as err:  # pylint: disable=broad-except
                    msg = ("An unhandled exception occurred loading pynvml. "
                           "Original error: {}".format(str(err)))
                    if self._logger:
                        self._logger.error(msg)
                    else:
                        print(msg)
                    self._initialized = True
                    return
            self._initialized = True
            self._get_device_count()
            self._get_active_devices()
            self._get_handles()
    def _shutdown(self):
        """ Shutdown pynvml if it was the library used for obtaining stats and set
        :attr:`_initialized` back to ``False``. """
        if self._initialized:
            self._handles = list()
            if not IS_MACOS and not self._is_plaidml:
                pynvml.nvmlShutdown()
            self._initialized = False

    def _get_device_count(self):
        """ Detect the number of GPUs attached to the system and allocate to
        :attr:`_device_count`. """
        if self._is_plaidml:
            self._device_count = self._plaid.device_count
        elif IS_MACOS:
            self._device_count = pynvx.cudaDeviceGetCount(ignore=True)
        else:
            try:
                self._device_count = pynvml.nvmlDeviceGetCount()
            except pynvml.NVMLError:
                self._device_count = 0
        self._log("debug", "GPU Device count: {}".format(self._device_count))

    def _get_active_devices(self):
        """ Obtain the indices of active GPUs (those that have not been explicitly excluded by
        CUDA_VISIBLE_DEVICES, plaidML or command line arguments) and allocate to
        :attr:`_active_devices`. """
        if self._is_plaidml:
            self._active_devices = self._plaid.active_devices
        else:
            if self._device_count == 0:
                self._active_devices = []
            else:
                devices = [idx for idx in range(self._device_count)
                           if idx not in _EXCLUDE_DEVICES]
                env_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "")
                if env_devices:
                    env_devices = [int(i) for i in env_devices.split(",")]
                    devices = [idx for idx in devices if idx in env_devices]
                self._active_devices = devices
        self._log("debug", "Active GPU Devices: {}".format(self._active_devices))

    def _get_handles(self):
        """ Obtain the internal handle identifiers for the system GPUs and allocate to
        :attr:`_handles`. """
        if self._is_plaidml:
            self._handles = self._plaid.devices
        elif IS_MACOS:
            self._handles = pynvx.cudaDeviceGetHandles(ignore=True)
        else:
            self._handles = [pynvml.nvmlDeviceGetHandleByIndex(i)
                             for i in range(self._device_count)]
        self._log("debug", "GPU Handles found: {}".format(len(self._handles)))
    def _get_driver(self):
        """ Obtain and return the installed driver version for the system's GPUs.

        Returns
        -------
        str
            The currently installed GPU driver version
        """
        if self._is_plaidml:
            driver = self._plaid.drivers
        elif IS_MACOS:
            driver = pynvx.cudaSystemGetDriverVersion(ignore=True)
        else:
            try:
                driver = pynvml.nvmlSystemGetDriverVersion().decode("utf-8")
            except pynvml.NVMLError:
                driver = "No Nvidia driver found"
        self._log("debug", "GPU Driver: {}".format(driver))
        return driver

    def _get_devices(self):
        """ Obtain the name of the installed devices. The quality of this information depends on
        the backend and OS being used, but it should be sufficient for identifying cards.

        Returns
        -------
        list
            List of device names for connected GPUs as corresponding to the values in
            :attr:`_handles`
        """
        self._initialize()
        if self._device_count == 0:
            names = list()
        elif self._is_plaidml:
            names = self._plaid.names
        elif IS_MACOS:
            names = [pynvx.cudaGetName(handle, ignore=True)
                     for handle in self._handles]
        else:
            names = [pynvml.nvmlDeviceGetName(handle).decode("utf-8")
                     for handle in self._handles]
        self._log("debug", "GPU Devices: {}".format(names))
        return names
    def _get_vram(self):
        """ Obtain the total VRAM in Megabytes for each connected GPU.

        Returns
        -------
        list
            List of floats containing the total amount of VRAM in Megabytes for each connected
            GPU as corresponding to the values in :attr:`_handles`
        """
        self._initialize()
        if self._device_count == 0:
            vram = list()
        elif self._is_plaidml:
            vram = self._plaid.vram
        elif IS_MACOS:
            vram = [pynvx.cudaGetMemTotal(handle, ignore=True) / (1024 * 1024)
                    for handle in self._handles]
        else:
            vram = [pynvml.nvmlDeviceGetMemoryInfo(handle).total / (1024 * 1024)
                    for handle in self._handles]
        self._log("debug", "GPU VRAM: {}".format(vram))
        return vram

    def _get_free_vram(self):
        """ Obtain the amount of VRAM that is available, in Megabytes, for each connected GPU.

        Returns
        -------
        list
            List of floats containing the amount of VRAM available, in Megabytes, for each
            connected GPU as corresponding to the values in :attr:`_handles`

        Notes
        -----
        There is no useful way to get free VRAM on PlaidML. OpenCL loads and unloads VRAM as
        required, so this returns the total memory available per card for AMD cards, which is
        not particularly useful.
        """
        self._initialize()
        if self._is_plaidml:
            vram = self._plaid.vram
        elif IS_MACOS:
            vram = [pynvx.cudaGetMemFree(handle, ignore=True) / (1024 * 1024)
                    for handle in self._handles]
        else:
            vram = [pynvml.nvmlDeviceGetMemoryInfo(handle).free / (1024 * 1024)
                    for handle in self._handles]
        self._shutdown()
        self._log("debug", "GPU VRAM free: {}".format(vram))
        return vram
    def get_card_most_free(self):
        """ Obtain statistics for the GPU with the most available free VRAM.

        Returns
        -------
        dict
            The dictionary contains the following data:

                **card_id** (`int`): The index of the card as pertaining to :attr:`_handles`

                **device** (`str`): The name of the device

                **free** (`float`): The amount of available VRAM on the GPU

                **total** (`float`): The total amount of VRAM on the GPU

            If a GPU is not detected then the **card_id** is returned as ``-1`` and the amount
            of free and total VRAM available is fixed to 2048 Megabytes.
        """
        if len(self._active_devices) == 0:
            return {"card_id": -1,
                    "device": "No GPU devices found",
                    "free": 2048,
                    "total": 2048}
        free_vram = [self._get_free_vram()[i] for i in self._active_devices]
        vram_free = max(free_vram)
        card_id = self._active_devices[free_vram.index(vram_free)]
        retval = {"card_id": card_id,
                  "device": self._devices[card_id],
                  "free": vram_free,
                  "total": self._vram[card_id]}
        self._log("debug", "Active GPU Card with most free VRAM: {}".format(retval))
        return retval
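
if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module. It assumes the
    # faceswap package layout (so `lib.utils` and `lib.plaidml_tools` resolve)
    # and passes log=False because the faceswap logger may not be configured
    # when this file is run directly.
    _stats = GPUStats(log=False)
    print("Device count   :", _stats.device_count)
    print("Devices        :", _stats.cli_devices)
    print("System info    :", _stats.sys_info)
    print("Most free VRAM :", _stats.get_card_most_free())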