
Analysis of 0_Simple__simpleP2P.cu

  • "cuda"
  • "simple"
  • "simplep2p"
  • "nvidia"

simpleP2P.cu demonstrates the P2P (peer-to-peer) feature for transferring, reading, and writing data between GPUs. The source code covers the checks required before P2P can be used, device-to-device copies, and host-device transfers and cross-device memory access.


Note: the simpleP2P source code is also available on GitHub.

1. Source Code

CUDA
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_cuda.h>
#include <helper_functions.h>

#define MAX_GPU_COUNT 64

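// Kernel: each thread doubles one element of src and writes it to dst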
__global__ void SimpleKernel(float *src, float *dst)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    dst[idx] = src[idx] * 2.0f;
}

inline bool IsGPUCapableP2P(cudaDeviceProp *pProp)
{
#ifdef _WIN32
    // On Windows, P2P requires the TCC (Tesla Compute Cluster) driver
    return (pProp->tccDriver != 0);
#else
    // On Linux, compute capability >= 2.0 (Fermi or newer) is required
    return (pProp->major >= 2);
#endif
}

int main(int argc, char **argv)
{
    printf("\n\tStarting\n", argv[0]);

    // Check for a 64-bit OS and a 64-bit build target
    if (sizeof(void*) != 8)
    {
        printf("\n\tError for program only supported with 64-bit OS and 64-bit target\n");
        return EXIT_WAIVED;
    }

    // Find the first two devices with compute capability >= 2.0
    int gpu_n;
    cudaGetDeviceCount(&gpu_n);
    printf("\n\tDevice count: %d\n", gpu_n);
    if (gpu_n < 2)
    {
        printf("\n\tError for two or more GPUs with SM2.0 required\n");
        return EXIT_WAIVED;
    }

    cudaDeviceProp prop[MAX_GPU_COUNT];
    int gpuid[MAX_GPU_COUNT], gpu_count = 0;
    printf("\n\tShow device\n");// 展示所有设备
    for (int i=0; i < gpu_n; i++)
    {
        cudaGetDeviceProperties(&prop[i], i);
        if ((prop[i].major >= 2)
#ifdef _WIN32
            && prop[i].tccDriver// Windows 系统还要求有 Tesla 计算集群驱动
#endif
           )
            gpuid[gpu_count++] = i;
        printf("\n\tGPU%d = \"%15s\" ---- %s\n", i, prop[i].name, (IsGPUCapableP2P(&prop[i]) ? "YES" : "NO"));
    }
    if (gpu_count < 2)
    {
        printf("\n\tError for two or more GPUs with SM2.0 required\n");
#ifdef _WIN32
        printf("\nOr for TCC driver required\n");
#endif
        cudaSetDevice(0);
        return EXIT_WAIVED;
    }

    // Find a pair of devices to test with
    int can_access_peer, p2pCapableGPUs[2];
    p2pCapableGPUs[0] = p2pCapableGPUs[1] = -1;
    printf("\n\tShow combination of devices with P2P\n");// 展示所有能 P2P 的设备组合
    for (int i = 0; i < gpu_count - 1; i++)
    {
        for (int j = i + 1; j < gpu_count; j++)
        {
            cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]);
            if (can_access_peer)
            {
                printf("\n\tGPU%d (%s) <--> GPU%d (%s) : %s\n", gpuid[i], prop[gpuid[i]].name, gpuid[j], prop[gpuid[j]].name);
                if (p2pCapableGPUs[0] == -1)
                    p2pCapableGPUs[0] = gpuid[i], p2pCapableGPUs[1] = gpuid[j];
            }
        }
    }
    if (p2pCapableGPUs[0] == -1 || p2pCapableGPUs[1] == -1)
    {
        printf("\n\tError for P2P not available among GPUs\n");
        for (int i=0; i < gpu_count; i++)
            cudaSetDevice(gpuid[i]);
        return EXIT_WAIVED;
    }

    // Run the test on the pair found above
    gpuid[0] = p2pCapableGPUs[0];
    gpuid[1] = p2pCapableGPUs[1];
    printf("\n\tEnabling P2P between GPU%d and GPU%d\n", gpuid[0], gpuid[1]);

    // Enable P2P access in both directions
    cudaSetDevice(gpuid[0]);
    cudaDeviceEnablePeerAccess(gpuid[1], 0);
    cudaSetDevice(gpuid[1]);
    cudaDeviceEnablePeerAccess(gpuid[0], 0);

    // Check that both devices support Unified Virtual Addressing (UVA)
    if (!(prop[gpuid[0]].unifiedAddressing && prop[gpuid[1]].unifiedAddressing))
    {
        printf("\n\tError: both GPUs must support Unified Virtual Addressing\n");
        return EXIT_WAIVED;
    }

    // Allocate buffers
    const size_t buf_size = 1024 * 1024 * 16 * sizeof(float);
    printf("\n\tAllocating buffers %iMB\n", int(buf_size / 1024 / 1024));
    cudaSetDevice(gpuid[0]);
    float *g0;
    cudaMalloc(&g0, buf_size);
    cudaSetDevice(gpuid[1]);
    float *g1;
    cudaMalloc(&g1, buf_size);
    float *h0;
    cudaMallocHost(&h0, buf_size);

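    // Timing events; cudaEventBlockingSync lets cudaEventSynchronize yield the CPU instead of spinning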
    cudaEvent_t start_event, stop_event;
    int eventflags = cudaEventBlockingSync;
    float time_memcpy;
    cudaEventCreateWithFlags(&start_event, eventflags);
    cudaEventCreateWithFlags(&stop_event, eventflags);
    cudaEventRecord(start_event, 0);

    for (int i=0; i<100; i++)
    {
        // Ping-pong copies between the two GPUs
        // With UVA, cudaMemcpyDefault infers the copy direction from the pointers themselves (host vs. device)
        if (i % 2 == 0)
            cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault);
        else
            cudaMemcpy(g0, g1, buf_size, cudaMemcpyDefault);
    }
    cudaEventRecord(stop_event, 0);
    cudaEventSynchronize(stop_event);
    cudaEventElapsedTime(&time_memcpy, start_event, stop_event);
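    // Effective bandwidth: 100 copies of buf_size bytes over the elapsed time, scaled by 1024^3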
    printf("\n\tcudaMemcpy: %.2fGB/s\n", (100.0f * buf_size) / (1024.0f * 1024.0f * 1024.0f * (time_memcpy / 1000.0f)));

    for (int i=0; i<buf_size / sizeof(float); i++)
        h0[i] = float(i % 4096);
    cudaSetDevice(gpuid[0]);
    cudaMemcpy(g0, h0, buf_size, cudaMemcpyDefault);

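    // Launch config: one thread per float element (16M elements in blocks of 512 threads)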
    const dim3 threads(512, 1);
    const dim3 blocks((buf_size / sizeof(float)) / threads.x, 1);

    // Run the kernel on GPU1: read GPU0's global memory, compute, and write to GPU1's global memory
    printf("\n\tRun kernel on GPU%d, reading data from GPU%d and writing to GPU%d\n", gpuid[1], gpuid[0], gpuid[1]);
    cudaSetDevice(gpuid[1]);
    SimpleKernel<<<blocks, threads>>>(g0, g1);
    cudaDeviceSynchronize();

    // Run the kernel on GPU0: read GPU1's global memory, compute, and write to GPU0's global memory
    printf("\n\tRun kernel on GPU%d, reading data from GPU%d and writing to GPU%d\n", gpuid[0], gpuid[1], gpuid[0]);
    cudaSetDevice(gpuid[0]);
    SimpleKernel<<<blocks, threads>>>(g1, g0);
    cudaDeviceSynchronize();

    // Verify the results: each value passed through SimpleKernel twice, so expect 4x the original
    cudaMemcpy(h0, g0, buf_size, cudaMemcpyDefault);
    int error_count = 0;
    for (int i=0; i<buf_size / sizeof(float); i++)
    {
        if (h0[i] != float(i % 4096) * 2.0f * 2.0f)
        {
            printf("\n\tResult error at %i: gpu[i] = %f, cpu[i] = %f\n", i, h0[i], (float(i%4096)*2.0f*2.0f));
            if (error_count++ > 10)
                break;
        }
    }

    // Disable P2P
    cudaSetDevice(gpuid[0]);
    cudaDeviceDisablePeerAccess(gpuid[1]);
    cudaSetDevice(gpuid[1]);
    cudaDeviceDisablePeerAccess(gpuid[0]);

    // Cleanup
    cudaFreeHost(h0);
    cudaSetDevice(gpuid[0]);
    cudaFree(g0);
    cudaSetDevice(gpuid[1]);
    cudaFree(g1);
    cudaEventDestroy(start_event);
    cudaEventDestroy(stop_event);
    for (int i=0; i<gpu_n; i++)
        cudaSetDevice(i);
    printf("\n\t%s!\n",error_count?"Test failed": "Test passed");

    getchar();
    return 0;
}
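
Note that the listing above drops the error checking present in the original sample (every call there is wrapped in checkCudaErrors from helper_cuda.h). A minimal stand-alone replacement is sketched below; CHECK_CUDA is a hypothetical name, not part of the CUDA API:

CUDA
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Hypothetical error-checking macro: prints the failing call's location and aborts
#define CHECK_CUDA(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error \"%s\" at %s:%d\n",                 \
                    cudaGetErrorString(err_), __FILE__, __LINE__);          \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Usage:
// CHECK_CUDA(cudaDeviceEnablePeerAccess(gpuid[1], 0));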

2. Key Points

P2P requirements: at least two devices with compute capability 2.0 or higher, both supporting unified virtual addressing (UVA); CUDA 4.0 or later; and, on Windows, the Tesla Compute Cluster (TCC) driver.
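
On CUDA 8.0 and later (newer than what this sample targets), cudaDeviceGetP2PAttribute exposes finer-grained pairwise capabilities than cudaDeviceCanAccessPeer. A minimal sketch, assuming devices 0 and 1 are visible:

CUDA
#include <stdio.h>
#include <cuda_runtime.h>

int main(void)
{
    // Query pairwise P2P attributes for the (0, 1) device pair
    int supported = 0, atomics = 0;
    cudaDeviceGetP2PAttribute(&supported, cudaDevP2PAttrAccessSupported, 0, 1);
    cudaDeviceGetP2PAttribute(&atomics, cudaDevP2PAttrNativeAtomicSupported, 0, 1);
    printf("P2P access 0 -> 1: %s, native atomics over the link: %s\n",
           supported ? "yes" : "no", atomics ? "yes" : "no");
    return 0;
}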

Key steps for using P2P

CUDA
// Check whether two devices can use P2P with each other
int can_access_peer;
cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]);

// Enable P2P (one call per direction)
cudaSetDevice(gpuid[i]);
cudaDeviceEnablePeerAccess(gpuid[j], 0);
cudaSetDevice(gpuid[j]);
cudaDeviceEnablePeerAccess(gpuid[i], 0);

// Transfer data between the devices
cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault);

// Disable P2P (note: the argument is the peer device, not the current one)
cudaSetDevice(gpuid[i]);
cudaDeviceDisablePeerAccess(gpuid[j]);
cudaSetDevice(gpuid[j]);
cudaDeviceDisablePeerAccess(gpuid[i]);

// Declarations from cuda_runtime_api.h
extern __host__ cudaError_t CUDARTAPI cudaDeviceCanAccessPeer(int *canAccessPeer, int device, int peerDevice);

extern __host__ cudaError_t CUDARTAPI cudaDeviceEnablePeerAccess(int peerDevice, unsigned int flags);

extern __host__ cudaError_t CUDARTAPI cudaDeviceDisablePeerAccess(int peerDevice);
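
When UVA is not available, cudaMemcpyDefault cannot infer the copy direction; the runtime also offers explicit peer copies. A minimal sketch, assuming g0/g1 and gpuid are set up as in the listing above:

CUDA
// Explicit peer copy: source and destination devices are named directly,
// so no UVA pointer inference is needed
cudaMemcpyPeer(g1, gpuid[1], g0, gpuid[0], buf_size);

// Asynchronous variant on a stream
cudaStream_t stream;
cudaStreamCreate(&stream);
cudaMemcpyPeerAsync(g1, gpuid[1], g0, gpuid[0], buf_size, stream);
cudaStreamSynchronize(stream);
cudaStreamDestroy(stream);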

3. Official API Parameter Meanings

CUDA Runtime API :: CUDA Toolkit Documentation
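
In brief:

  • cudaDeviceCanAccessPeer(&canAccessPeer, device, peerDevice) sets canAccessPeer to 1 if device can directly access memory on peerDevice, and to 0 otherwise.
  • cudaDeviceEnablePeerAccess(peerDevice, flags) grants the current device access to memory on peerDevice; flags is reserved and must be 0. Access is one-directional, which is why the sample enables it from both sides; enabling it twice returns cudaErrorPeerAccessAlreadyEnabled.
  • cudaDeviceDisablePeerAccess(peerDevice) revokes the current device's access to peerDevice; it returns cudaErrorPeerAccessNotEnabled if access had not been enabled.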