Commit 46e722b

Initial commit

2 files changed, +300 -0 lines changed

README.md

Lines changed: 126 additions & 0 deletions

## Slurm SPANK GPU Compute Mode plugin

The GPU Compute Mode [SPANK](https://slurm.schedmd.com/spank.html) plugin for
[Slurm](https://slurm.schedmd.com/) allows users to choose the compute mode of
the GPUs they submit jobs to.

### Rationale

NVIDIA GPUs can be set to operate under different compute modes:

* `Default` (shared): Multiple host threads can use the device at the same time.
* `Exclusive-process`: Only one CUDA context may be created on the device across all processes in the system.
* `Prohibited`: No CUDA context can be created on the device.

_More information is available in the [CUDA Programming guide](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-modes)_

To ensure optimal performance and process isolation, and to avoid situations
where multiple processes unintentionally run on the same GPU, it is often
recommended to set the GPU compute mode to `Exclusive-process`.

Unfortunately, some legacy applications need to run concurrent processes on a
single GPU to function properly, and thus require the GPUs to be set to the
`Default` (_i.e._ shared) compute mode.

Hence the need for a mechanism that allows users to choose the compute mode of
the GPUs their job will run on.

## Installation

**Requirements**:
* [`slurm-spank-lua`](https://github.com/stanford-rc/slurm-spank-lua) (submission and execution nodes)
* [`nvidia-smi`](https://developer.nvidia.com/nvidia-system-management-interface) (execution nodes)

The Slurm SPANK GPU Compute Mode plugin is written in [Lua](http://www.lua.org)
and requires the
[`slurm-spank-lua`](https://github.com/stanford-rc/slurm-spank-lua) plugin to
work.

Once the `slurm-spank-lua` plugin is installed and configured, the
`gpu_cmode.lua` script can be dropped in the appropriate directory (by default,
`/etc/slurm/lua.d`, or any other location specified in the `plugstack.conf`
configuration file).
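
(For reference, the `slurm-spank-lua` plugin is typically enabled with a
`plugstack.conf` line along the lines of
`required /usr/lib64/slurm/lua.so /etc/slurm/lua.d/*.lua`; the exact library
path and script location depend on your installation, so check the
`slurm-spank-lua` documentation.)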

Note that both the Slurm SPANK Lua plugin and the GPU Compute Mode Lua plugin
need to be present on both the submission host(s) (where `srun`/`sbatch`
commands are executed) and the execution host(s) (where the job actually
runs).

_Note_: The plugin defines a default GPU compute mode (`exclusive`), which is
used to reset the GPUs at the end of a job. The default mode can be changed by
editing the value of `default_cmode` in the script.
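
For example, to have the GPUs reset to the shared mode at the end of jobs
instead, the corresponding lines in `gpu_cmode.lua` would be changed as follows
(a minimal illustration; `shared` is just an example value, any of the valid
modes can be used):

```
-- default mode
-- GPUs will be reset to that mode at the end of the job
--
default_cmode = "shared"
```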

## Usage

The Slurm SPANK GPU Compute Mode plugin introduces a new option to `srun` and
`sbatch`: `--gpu_cmode`.

```
$ srun --help
[...]
      --gpu_cmode=<shared|exclusive|prohibited>
                              Set the GPU compute mode on the allocated GPUs to
                              shared, exclusive or prohibited. Default is
                              exclusive
[...]
```

### Examples

##### Requesting `Default` compute mode
> `--gpu_cmode=shared`

```
$ srun --gres gpu:1 --gpu_cmode=shared nvidia-smi --query-gpu=compute_mode --format=csv,noheader
Default
```

##### Requesting `Exclusive-process` compute mode
> `--gpu_cmode=exclusive`

```
$ srun --gres gpu:1 --gpu_cmode=exclusive nvidia-smi --query-gpu=compute_mode --format=csv,noheader
Exclusive_Process
```

##### Requesting `Prohibited` compute mode
> `--gpu_cmode=prohibited`

```
$ srun --gres gpu:1 --gpu_cmode=prohibited nvidia-smi --query-gpu=compute_mode --format=csv,noheader
Prohibited
```

##### Multi-GPU job

```
$ srun --gres gpu:4 --gpu_cmode=shared nvidia-smi --query-gpu=compute_mode --format=csv,noheader
Default
Default
Default
Default
```

##### Multi-node job

```
$ srun -l -N 2 --ntasks-per-node=1 --gres gpu:1 --gpu_cmode=shared nvidia-smi --query-gpu=compute_mode --format=csv,noheader
1: Default
0: Default
```

**NB**: If the `--gpu_cmode` option is not used, no modification will be made
to the current compute mode of the GPUs, and the site default will be used.

gpu_cmode.lua

Lines changed: 174 additions & 0 deletions

-- ============================================================================
-- SPANK plugin to allow users to choose the compute mode on the GPUs allocated
-- to their job. Requires `nvidia-smi` on the compute node, and the Slurm SPANK
-- Lua plugin.
--
-- Adds a --gpu_cmode=MODE option to srun/sbatch/salloc, with MODE:
--   0: shared
--   1: exclusive (exclusive_thread: deprecated, use 3)
--   2: prohibited
--   3: exclusive (exclusive_process)
--
-- Reference:
-- http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-modes
--
-- # Author : Kilian Cavalotti <kilian@stanford.edu>
-- # Created : 2018/01/22
-- # License : GPL 2.0
-- ============================================================================


--
-- constants ------------------------------------------------------------------
--

-- plugin name (for logging)
--
myname = "SPANK:gpu_cmode"

-- GPU compute modes definitions
--
valid_cmodes = {
    [0]="shared",
    [1]="exclusive",
    [2]="prohibited"
}
-- reverse index
--
cmodes_index = {}
for k,v in pairs(valid_cmodes) do cmodes_index[v]=k end
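-- e.g. cmodes_index["shared"] == 0, the numeric value passed to `nvidia-smi -c`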

-- default mode
-- GPUs will be reset to that mode at the end of the job
--
default_cmode = "exclusive"


-- define new --gpu_cmode option for srun/salloc/sbatch
--
spank_options = {
    {
        name = "gpu_cmode",
        usage = "Set the GPU compute mode on the allocated GPUs to " ..
                "shared, exclusive or prohibited. Default is " ..
                default_cmode,
        arginfo = "<shared|exclusive|prohibited>",
        has_arg = 1,
        cb = "opt_handler"
    },
}


--
-- functions ------------------------------------------------------------------
--

-- execute command and return output
--
function exec(cmd)
    local handle = io.popen(cmd)
    local result = handle:read("*a") or ""
    handle:close()
    result = string.gsub(result, "\n$", "")
    return result
end

-- validate compute mode
--
function validate_cmode(cmode)
    for _, value in pairs(valid_cmodes) do
        if value == cmode then
            return true
        end
    end
    return false
end

-- check options
--
function opt_handler(val, optarg, isremote)
    cmode = optarg
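    -- only validate the requested mode in the local (srun/sbatch) context;
    -- in the remote context, the value is accepted as-is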
    if isremote or validate_cmode(optarg) then
        return SPANK.SUCCESS
    end
    return SPANK.FAILURE
end


--
-- SPANK functions ------------------------------------------------------------
-- cf. https://slurm.schedmd.com/spank.html
--

-- SPANK function, called after privileges are temporarily dropped.
-- needs to run as root, but in the job cgroup context, if any.
--
function slurm_spank_user_init(spank)

    -- if context is not "remote" or compute mode is not defined, do nothing
    if spank.context ~= "remote" or cmode == nil then
        return SPANK.SUCCESS
    end

    -- get GPU ids from CUDA_VISIBLE_DEVICES
    device_ids = spank:getenv("CUDA_VISIBLE_DEVICES")
    if device_ids == nil or device_ids == "" then
        SPANK.log_error(myname .. ": CUDA_VISIBLE_DEVICES not set.")
        return SPANK.FAILURE
    end

    -- check for nvidia-smi
    nvs_path = exec("which nvidia-smi")
    if nvs_path:match("nvidia%-smi$") == nil then
        SPANK.log_error(myname .. ": can't find nvidia-smi in PATH.")
        return SPANK.FAILURE
    end

    -- set compute mode on GPUs
    SPANK.log_info(myname .. ": changing compute mode to '%s' on GPU(s): %s\n",
                   cmode, device_ids)
    local cmd = nvs_path .. " -c " .. cmodes_index[cmode] ..
                " -i " .. device_ids
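    -- note: assumes Lua 5.1 os.execute() semantics, where a numeric exit
    -- status is returned (Lua 5.2+ returns a boolean first)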
    local ret = tonumber(os.execute(cmd))
    SPANK.log_debug(myname .. ": DEBUG: cmd = %s\n", cmd)
    SPANK.log_debug(myname .. ": DEBUG: ret = %s\n", ret)

    -- check return code
    if ret ~= 0 then
        SPANK.log_error(myname .. ": error setting compute mode to '%s'" ..
                        " on GPU(s): %s\n", cmode, device_ids)
        return SPANK.FAILURE
    end

    return SPANK.SUCCESS
end


-- SPANK function called for each task as its exit status is collected by Slurm
-- needs to run as root, in the job cgroup context, if any.
--
function slurm_spank_task_exit(spank)

    -- if context is not "remote" or compute mode is not defined, do nothing
    if spank.context ~= "remote" or cmode == nil then
        return SPANK.SUCCESS
    end

    -- reset compute mode on GPUs
    SPANK.log_info(myname .. ": resetting compute mode to default '%s'" ..
                   " on GPU(s): %s\n", default_cmode, device_ids)
    local cmd = nvs_path .. " -c " .. cmodes_index[default_cmode] ..
                " -i " .. device_ids
    local ret = tonumber(os.execute(cmd))
    SPANK.log_debug(myname .. ": DEBUG: cmd = %s\n", cmd)
    SPANK.log_debug(myname .. ": DEBUG: ret = %s\n", ret)

    -- check return code
    if ret ~= 0 then
        SPANK.log_error(myname .. ": error resetting compute mode to default" ..
                        " '%s' on GPU(s): %s\n", default_cmode, device_ids)
        return SPANK.FAILURE
    end

    return SPANK.SUCCESS
end
