
GPU Node Usage

Introduction and Basic Usage of GPU Nodes

The cluster has 2 GPU nodes, gpu01 and gpu02, each with 2 GPU cards. These two nodes are assigned exclusively to the gpu queue, so a job script must specify that queue ( #BSUB -q gpu ) to use the GPU nodes. Access to this queue requires an application to the administrator, and each user may use at most 5 CPU cores.

GPU models: gpu01 Tesla P100, gpu02 Tesla K40m

CUDA version: 11.4 by default. If another CUDA version is needed, load it with module, e.g. module load cuda/10.2
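
Putting this together, a minimal batch job script might look like the sketch below. The core count and the script name train.py are illustrative assumptions, and some LSF configurations additionally require an explicit #BSUB -gpu resource request, so check the exact requirements with the administrator.

#BSUB -q gpu            # GPU queue, required for the GPU nodes
#BSUB -n 4              # CPU cores; at most 5 per user on this queue
#BSUB -J gpu_job        # job name
#BSUB -o %J.out         # stdout file (%J expands to the job id)
#BSUB -e %J.err         # stderr file

module load cuda/10.2
python train.py         # train.py is a placeholder for your own program

# Submit with: bsub < gpu_job.lsf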

Deep Learning Libraries

Python deep learning libraries are not pre-installed on the cluster; users need to install them themselves. The following uses TensorFlow as an example to show how to install and use a deep learning library on the cluster.

Installing with conda

First start an interactive session on a GPU node: bsub -q gpu -Is bash

#Create a conda environment; a Python version can be specified, e.g. conda create -n tf python=3.6
conda create -n tf

#Activate the environment
source activate tf

#Install TensorFlow; a version can also be specified, e.g. conda install tensorflow==1.14
conda install tensorflow

#Test the installation
python -c "import tensorflow as tf;print(tf.reduce_sum(tf.random.normal([1000, 1000])))"
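
To further confirm that TensorFlow actually sees the GPU cards, the public tf.config API can be used (this is the TF 2.x interface; on TF 1.x use tf.test.is_gpu_available() instead):

#List the visible GPUs; an empty list means no card is visible to TensorFlow
python -c "import tensorflow as tf; print(tf.config.list_physical_devices('GPU'))"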

Installing with pip

First start an interactive session on a GPU node: bsub -q gpu -Is bash

# Create a virtual environment; `./tf_venv` is the environment directory, and all packages for this environment will be installed under it
python3 -m venv ./tf_venv

#Activate the environment
source ./tf_venv/bin/activate

#Install TensorFlow; a version can be specified, e.g. pip install --upgrade tensorflow==1.15 (1.15 is the final TensorFlow 1.x release)
pip install --upgrade tensorflow

#Test the installation
python -c "import tensorflow as tf;print(tf.reduce_sum(tf.random.normal([1000, 1000])))"
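
Once the environment works interactively, the same steps can be wrapped in a batch job script. The sketch below assumes the virtual environment created above and uses tf_train.py as a placeholder for your own program:

#BSUB -q gpu
#BSUB -o %J.out
#BSUB -e %J.err

source ./tf_venv/bin/activate
python tf_train.py    # placeholder for your own training script

# Submit with: bsub < tf_job.lsf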

Singularity Images

NVIDIA publishes official container images for PyTorch and TensorFlow. Each bundles a matching CUDA toolkit and cuDNN (the host GPU driver is bound in at runtime, which is why singularity must be run with the --nv option), and they are reportedly faster than self-installed setups.

Some of NVIDIA's official deep learning images have been downloaded onto the cluster, under /share/Singularity/nvidia

Usage Example

First start an interactive session on a GPU node: bsub -q gpu -Is bash

module load Singularity/3.7.3

singularity exec --nv /share/Singularity/nvidia/tensorflow_20.11-tf2-py3.sif python -c "import tensorflow as tf;print(tf.reduce_sum(tf.random.normal([1000, 1000])))"
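
To explore an image interactively instead, singularity shell (a standard Singularity subcommand) drops you into a shell inside the container; --nv again passes through the host GPUs:

module load Singularity/3.7.3
singularity shell --nv /share/Singularity/nvidia/tensorflow_20.11-tf2-py3.sif

#Inside the container, the bundled Python and TensorFlow are on the PATH:
#python -c "import tensorflow as tf; print(tf.__version__)"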

Checking GPU Resource Usage

The lsload -gpuload command shows the resource usage of all GPU cards.

$ lsload -gpuload
HOST_NAME       gpuid   gpu_model   gpu_mode  gpu_temp   gpu_ecc  gpu_ut  gpu_mut gpu_mtotal gpu_mused   gpu_pstate   gpu_status   gpu_error
gpu02               0   TeslaK40m        0.0       38C       0.0      0%       0%      11.9G        0M            8           ok           -
                    1   TeslaK40m        0.0       39C       0.0      0%       0%      11.9G        0M            8           ok           -
gpu01               0 TeslaP100_P        0.0       60C       0.0     93%      36%      15.8G     11.9G            0           ok           -
                    1 TeslaP100_P        0.0       66C       0.0     99%      35%      15.8G     15.2G            0           ok           -
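
Alternatively, once on a GPU node (interactively or inside a job), the standard nvidia-smi tool shows per-card utilization and the processes currently occupying each card:

bsub -q gpu -Is bash
nvidia-smi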

FAQ

  • Out-of-memory error on a GPU card: RuntimeError: CUDA error: out of memory

    This usually happens because another program already running on the card has taken part of its memory. You can work around it by manually pointing your program at another, idle card, as shown below:

    import os
    os.environ['CUDA_VISIBLE_DEVICES'] = "0"  # or "1"; set this before the first CUDA call
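
    The same selection can be made from the shell without modifying the code, since CUDA_VISIBLE_DEVICES is an environment variable honored by CUDA itself (your_script.py below is a placeholder):

    # Run on card 1 only; inside the program it appears as device 0
    CUDA_VISIBLE_DEVICES=1 python your_script.py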
    

  • The K40m cards on gpu02 support PyTorch only up to version 1.2

    micromamba create -n pytorch12 python==3.7
    micromamba activate pytorch12
    conda install pytorch=1.2 torchvision
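
    A quick check that the installed build can actually drive the K40m (prints the PyTorch version and whether CUDA sees the card):

    python -c "import torch; print(torch.__version__, torch.cuda.is_available())"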
    

Speed Test

$ python pytorch_benchmark.py -h
usage: pytorch_benchmark.py [-h] [-i I] [-e E] [-bp BP] [-bs BS]

Used to check pytorch speed benchmark.

optional arguments:
    -h, --help  show this help message and exit
    -i I        Card id. Which cuda card do you want to test. default: 0
    -e E        Epoch. default: 500
    -bp BP      Use backward. default: True
    -bs BS      Batch size. default: 8

# -e changes the problem size; at equal size, a shorter run time means a faster card
# -i selects the GPU card id
$ python pytorch_benchmark.py -e 50 -i 0
Speed benchmark begin.
Speed benchmark finish.
Result
cuda_time: 56.0926796875
perf_counter_time: 56.09359596297145
no_num_error: True
deterministic: True
benchmark: False
platform: Linux-3.10.0-862.el7.x86_64-x86_64-with-centos-7.5.1804-Core
machine: x86_64
python_build: ('default', 'Oct  9 2018 10:31:47')
test_time: 2023-02-25T17:39:57.235769
'''
Benchmark script for measuring GPU speed.
'''
import os
import torch
import torch.nn as nn
from torch.backends import cudnn
import argparse
import time
import datetime
import platform
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # disabled: would override the -i card selection below


def ConvBnAct(in_ch, out_ch, ker_sz, stride, pad, act=nn.Identity(), group=1, dilation=1):
    return nn.Sequential(nn.Conv2d(in_ch, out_ch, ker_sz, stride, pad, groups=group, bias=False, dilation=dilation),
                         nn.GroupNorm(16, out_ch, eps=1e-8),
                         act)


def DeConvBnAct(in_ch, out_ch, ker_sz, stride, pad, act=nn.Identity(), group=1, dilation=1):
    return nn.Sequential(nn.ConvTranspose2d(in_ch, out_ch, ker_sz, stride, pad, groups=group, bias=False, dilation=dilation),
                         nn.GroupNorm(16, out_ch, eps=1e-8),
                         act)


class RevSequential(nn.ModuleList):
    '''
    Largely overlaps with ModuleList, but every appended module must provide an invert() method.
    '''
    def __init__(self, modules=None):
        super().__init__(modules)

    def append(self, module):
        assert hasattr(module, 'invert') and callable(module.invert)
        super().append(module)

    def extend(self, modules):
        for m in modules:
            self.append(m)

    def forward(self, x1, x2):
        y1, y2 = x1, x2
        for m in self:
            y1, y2 = m(y1, y2)
        return y1, y2

    def invert(self, y1, y2):
        x1, x2 = y1, y2
        for m in list(self)[::-1]:
            x1, x2 = m.invert(x1, x2)
        return x1, x2


class RevGroupBlock(RevSequential):
    '''
    Currently only supports input channels equal to output channels, and downsampling is not allowed.
    '''
    def __init__(self, in_ch, out_ch, stride, act, block_type, blocks, **kwargs):
        assert in_ch == out_ch
        assert stride == 1
        mods = []
        for _ in range(blocks):
            mods.append(block_type(in_ch=in_ch, out_ch=out_ch, stride=1, act=act, **kwargs))
        # self.extend(mods)
        super().__init__(mods)


class RevBlockC(nn.Module):
    def __init__(self, in_ch, out_ch, stride, act, **kwargs):
        super().__init__()
        inter_ch = in_ch // 2
        self.conv1 = ConvBnAct(in_ch, inter_ch, ker_sz=5, stride=1, pad=2, act=act)
        self.conv2 = ConvBnAct(inter_ch, inter_ch, ker_sz=5, stride=1, pad=2, act=act, group=inter_ch)
        self.conv3 = ConvBnAct(in_ch, in_ch, ker_sz=1, stride=1, pad=0, act=nn.Identity())

    def func(self, x):
        y1 = self.conv1(x)
        y2 = self.conv2(y1)
        y = torch.cat([y1, y2], dim=1)
        y = self.conv3(y)
        return y

    def forward(self, x1, x2):
        y = x1 + self.func(x2)
        return x2, y

    def invert(self, y1, y2):
        x2, y = y1, y2
        x1 = y - self.func(x2)
        return x1, x2


def new_model():
    act = nn.ELU()
    rvb = RevGroupBlock(128, 128, 1, act, RevBlockC, 12).to(device)
    rvb.eval()
    return rvb


if __name__ == '__main__':
    cudnn.benchmark = False
    cudnn.deterministic = True
    torch.set_grad_enabled(False)

    parse = argparse.ArgumentParser(description='Used to check pytorch speed benchmark.')
    parse.add_argument('-i', type=int, help='Card id. Which cuda card do you want to test. default: 0', default=0)
    parse.add_argument('-e', type=int, help='Epoch. default: 500', default=500)
    # argparse's type=bool would treat any non-empty string as True, so parse the flag explicitly
    parse.add_argument('-bp', type=lambda s: s.lower() in ('true', '1', 'yes'),
                       help='Use backward. default: True', default=True)
    parse.add_argument('-bs', type=int, help='Batch size. default: 8', default=8)
    parse = parse.parse_args()

    card_id = parse.i
    epoch = parse.e
    use_backward = parse.bp
    batch_size = parse.bs

    # Running on the CPU (card id -1) should in theory never produce numerical errors
    device = 'cpu' if card_id == -1 else f'cuda:{card_id}'
    device = torch.device(device)
    assert epoch > 0
    assert batch_size > 0

    rvb = new_model()

    is_no_num_error = True

    torch.set_grad_enabled(use_backward)

    # CUDA events time the GPU-side work; time.perf_counter measures wall-clock time
    start_record = torch.cuda.Event(enable_timing=True)
    end_record = torch.cuda.Event(enable_timing=True)

    print('Speed benchmark begin.')
    start_time = time.perf_counter()
    start_record.record()
    for e in range(epoch):
        e = e+1

        a1 = torch.randn(batch_size, 128, 64, 64, device=device)
        b1, b2 = rvb(a1, a1)
        o_a1, o_a2 = rvb.invert(b1, b2)

        if use_backward:
            (o_a1.max() + o_a2.max()).backward()

        with torch.no_grad():
            # both outputs should reconstruct a1; large differences signal numerical problems
            max_diff_1 = torch.abs(o_a1 - o_a2).max().item()
            max_diff_2 = torch.abs(a1 - o_a1).max().item()

        # cur_time = time.perf_counter()
        # cost_time = cur_time-start_time
        # guess_full_cost_time = cost_time / e * epoch
        #
        # line = f'card_id: {card_id} elapsed/total: {e}/{epoch} time: {int(cost_time)}/{int(guess_full_cost_time)} md1: {max_diff_1:.8f} md2: {max_diff_2:.8f}'
        # print(line)

        if max_diff_1 > 1e-3 or max_diff_2 > 1e-3:
            print(f'A large numerical error was found! diff_1: {max_diff_1:.8f} diff_2: {max_diff_2:.8f}')
            is_no_num_error = False

    end_record.record()
    torch.cuda.synchronize()
    end_time = time.perf_counter()

    cuda_time = start_record.elapsed_time(end_record) / 1000
    perf_counter_time = end_time - start_time

    print('Speed benchmark finish.')

    result = {
        'cuda_time': cuda_time,
        'perf_counter_time': perf_counter_time,
        'no_num_error': is_no_num_error,
        'deterministic': cudnn.deterministic,
        'benchmark': cudnn.benchmark,
        'platform': platform.platform(),
        'machine': platform.machine(),
        'python_build': platform.python_build(),
#        'device': 'cpu' if device == torch.device('cpu') else torch.cuda.get_device_name(device),
#        'cuda_version': '' if device == torch.device('cpu') else torch.cuda_version,
        'test_time': datetime.datetime.now().isoformat(),
    }

    print('Result')
    for k, v in result.items():
        print(f'{k}: {v}')

References:

A100 Tensor Float 32 性能实测 (A100 Tensor Float 32 performance test)
