
Analysis of 0_Simple__simpleP2P.cu

  • "cuda"
  • "simple"
  • "simplep2p"
  • "nvidia"

simpleP2P.cu demonstrates the P2P (peer-to-peer) feature for transferring, reading, and writing data between GPUs. The source code covers the checks required before P2P can be used, device-to-device copies, and host-device transfers and cross-device memory access.


Note: the simpleP2P source code is also available on GitHub.

1. Source Code

CUDA
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_cuda.h>
#include <helper_functions.h>

#define MAX_GPU_COUNT 64

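// Kernel: each thread doubles one element of src and writes it to dst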
__global__ void SimpleKernel(float *src, float *dst)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    dst[idx] = src[idx] * 2.0f;
}

inline bool IsGPUCapableP2P(cudaDeviceProp *pProp)
{
#ifdef _WIN32
    // On Windows, P2P requires the TCC (Tesla Compute Cluster) driver
    return (pProp->tccDriver != 0);
#else
    // On Linux, compute capability >= 2.0 (Fermi or newer) is required
    return (pProp->major >= 2);
#endif
}

int main(int argc, char **argv)
{
    printf("\n\tStarting\n", argv[0]);

    // Check for a 64-bit OS and a 64-bit build target
    if (sizeof(void*) != 8)
    {
        printf("\n\tError for program only supported with 64-bit OS and 64-bit target\n");
        return EXIT_WAIVED;
    }

    // Find the first two devices with compute capability >= 2.0
    int gpu_n;
    cudaGetDeviceCount(&gpu_n);
    printf("\n\tDevice count: %d\n", gpu_n);
    if (gpu_n < 2)
    {
        printf("\n\tError for two or more GPUs with SM2.0 required\n");
        return EXIT_WAIVED;
    }

    cudaDeviceProp prop[MAX_GPU_COUNT];
    int gpuid[MAX_GPU_COUNT], gpu_count = 0;
    printf("\n\tShow device\n");// 展示所有设备
    for (int i=0; i < gpu_n; i++)
    {
        cudaGetDeviceProperties(&prop[i], i);
        if ((prop[i].major >= 2)
#ifdef _WIN32
            && prop[i].tccDriver// Windows 系统还要求有 Tesla 计算集群驱动
#endif
           )
            gpuid[gpu_count++] = i;
        printf("\n\tGPU%d = \"%15s\" ---- %s\n", i, prop[i].name, (IsGPUCapableP2P(&prop[i]) ? "YES" : "NO"));
    }
    if (gpu_count < 2)
    {
        printf("\n\tError for two or more GPUs with SM2.0 required\n");
#ifdef _WIN32
        printf("\nOr for TCC driver required\n");
#endif
        cudaSetDevice(0);
        return EXIT_WAIVED;
    }

    // Find a pair of devices to test with
    int can_access_peer, p2pCapableGPUs[2];
    p2pCapableGPUs[0] = p2pCapableGPUs[1] = -1;
    printf("\n\tShow combination of devices with P2P\n");// 展示所有能 P2P 的设备组合
    for (int i = 0; i < gpu_count - 1; i++)
    {
        for (int j = i + 1; j < gpu_count; j++)
        {
            cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]);
            if (can_access_peer)
            {
                printf("\n\tGPU%d (%s) <--> GPU%d (%s) : %s\n", gpuid[i], prop[gpuid[i]].name, gpuid[j], prop[gpuid[j]].name);
                if (p2pCapableGPUs[0] == -1)
                    p2pCapableGPUs[0] = gpuid[i], p2pCapableGPUs[1] = gpuid[j];
            }
        }
    }
    if (p2pCapableGPUs[0] == -1 || p2pCapableGPUs[1] == -1)
    {
        printf("\n\tError for P2P not available among GPUs\n");
        for (int i=0; i < gpu_count; i++)
            cudaSetDevice(gpuid[i]);
        return EXIT_WAIVED;
    }

    // Run the test on the pair found above
    gpuid[0] = p2pCapableGPUs[0];
    gpuid[1] = p2pCapableGPUs[1];
    printf("\n\tEnabling P2P between GPU%d and GPU%d\n", gpuid[0], gpuid[1]);

    // Enable P2P access in both directions
    cudaSetDevice(gpuid[0]);
    cudaDeviceEnablePeerAccess(gpuid[1], 0);
    cudaSetDevice(gpuid[1]);
    cudaDeviceEnablePeerAccess(gpuid[0], 0);

    // Check that both devices support Unified Virtual Addressing (UVA)
    if (!(prop[gpuid[0]].unifiedAddressing && prop[gpuid[1]].unifiedAddressing))
    {
        printf("\n\tError: both GPUs must support Unified Virtual Addressing\n");
        return EXIT_WAIVED;
    }

    // Allocate buffers
    const size_t buf_size = 1024 * 1024 * 16 * sizeof(float);
    printf("\n\tAllocating buffers %iMB\n", int(buf_size / 1024 / 1024));
    cudaSetDevice(gpuid[0]);
    float *g0;
    cudaMalloc(&g0, buf_size);
    cudaSetDevice(gpuid[1]);
    float *g1;
    cudaMalloc(&g1, buf_size);
    float *h0;
    cudaMallocHost(&h0, buf_size);

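    // Timing events; cudaEventBlockingSync lets cudaEventSynchronize yield the CPU instead of spinning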
    cudaEvent_t start_event, stop_event;
    int eventflags = cudaEventBlockingSync;
    float time_memcpy;
    cudaEventCreateWithFlags(&start_event, eventflags);
    cudaEventCreateWithFlags(&stop_event, eventflags);
    cudaEventRecord(start_event, 0);

    for (int i=0; i<100; i++)
    {
        // Ping-pong copies between the two GPUs
        // With UVA, cudaMemcpyDefault infers the copy direction from the pointers themselves (host vs. device)
        if (i % 2 == 0)
            cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault);
        else
            cudaMemcpy(g0, g1, buf_size, cudaMemcpyDefault);
    }
    cudaEventRecord(stop_event, 0);
    cudaEventSynchronize(stop_event);
    cudaEventElapsedTime(&time_memcpy, start_event, stop_event);
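    // Effective bandwidth: 100 copies of buf_size bytes over the elapsed time, scaled by 1024^3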
    printf("\n\tcudaMemcpy: %.2fGB/s\n", (100.0f * buf_size) / (1024.0f * 1024.0f * 1024.0f * (time_memcpy / 1000.0f)));

    for (int i=0; i<buf_size / sizeof(float); i++)
        h0[i] = float(i % 4096);
    cudaSetDevice(gpuid[0]);
    cudaMemcpy(g0, h0, buf_size, cudaMemcpyDefault);

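    // Launch config: one thread per float element (16M elements in blocks of 512 threads)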
    const dim3 threads(512, 1);
    const dim3 blocks((buf_size / sizeof(float)) / threads.x, 1);

    // Run the kernel on GPU1: read GPU0's global memory, compute, and write to GPU1's global memory
    printf("\n\tRun kernel on GPU%d, reading data from GPU%d and writing to GPU%d\n", gpuid[1], gpuid[0], gpuid[1]);
    cudaSetDevice(gpuid[1]);
    SimpleKernel<<<blocks, threads>>>(g0, g1);
    cudaDeviceSynchronize();

    // Run the kernel on GPU0: read GPU1's global memory, compute, and write to GPU0's global memory
    printf("\n\tRun kernel on GPU%d, reading data from GPU%d and writing to GPU%d\n", gpuid[0], gpuid[1], gpuid[0]);
    cudaSetDevice(gpuid[0]);
    SimpleKernel<<<blocks, threads>>>(g1, g0);
    cudaDeviceSynchronize();

    // Verify the results: each value passed through SimpleKernel twice, so expect 4x the original
    cudaMemcpy(h0, g0, buf_size, cudaMemcpyDefault);
    int error_count = 0;
    for (int i=0; i<buf_size / sizeof(float); i++)
    {
        if (h0[i] != float(i % 4096) * 2.0f * 2.0f)
        {
            printf("\n\tResult error at %i: gpu[i] = %f, cpu[i] = %f\n", i, h0[i], (float(i%4096)*2.0f*2.0f));
            if (error_count++ > 10)
                break;
        }
    }

    // Disable P2P
    cudaSetDevice(gpuid[0]);
    cudaDeviceDisablePeerAccess(gpuid[1]);
    cudaSetDevice(gpuid[1]);
    cudaDeviceDisablePeerAccess(gpuid[0]);

    // Cleanup
    cudaFreeHost(h0);
    cudaSetDevice(gpuid[0]);
    cudaFree(g0);
    cudaSetDevice(gpuid[1]);
    cudaFree(g1);
    cudaEventDestroy(start_event);
    cudaEventDestroy(stop_event);
    for (int i=0; i<gpu_n; i++)
        cudaSetDevice(i);
    printf("\n\t%s!\n",error_count?"Test failed": "Test passed");

    getchar();
    return 0;
}
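
Note that the listing above drops the error checking present in the original sample (every call there is wrapped in checkCudaErrors from helper_cuda.h). A minimal stand-alone replacement is sketched below; CHECK_CUDA is a hypothetical name, not part of the CUDA API:

CUDA
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Hypothetical error-checking macro: prints the failing call's location and aborts
#define CHECK_CUDA(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error \"%s\" at %s:%d\n",                 \
                    cudaGetErrorString(err_), __FILE__, __LINE__);          \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Usage:
// CHECK_CUDA(cudaDeviceEnablePeerAccess(gpuid[1], 0));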

2. Key Points

P2P requirements: at least two devices with compute capability 2.0 or higher, both supporting unified virtual addressing (UVA); CUDA 4.0 or later; and, on Windows, the Tesla Compute Cluster (TCC) driver.
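
On CUDA 8.0 and later (newer than what this sample targets), cudaDeviceGetP2PAttribute exposes finer-grained pairwise capabilities than cudaDeviceCanAccessPeer. A minimal sketch, assuming devices 0 and 1 are visible:

CUDA
#include <stdio.h>
#include <cuda_runtime.h>

int main(void)
{
    // Query pairwise P2P attributes for the (0, 1) device pair
    int supported = 0, atomics = 0;
    cudaDeviceGetP2PAttribute(&supported, cudaDevP2PAttrAccessSupported, 0, 1);
    cudaDeviceGetP2PAttribute(&atomics, cudaDevP2PAttrNativeAtomicSupported, 0, 1);
    printf("P2P access 0 -> 1: %s, native atomics over the link: %s\n",
           supported ? "yes" : "no", atomics ? "yes" : "no");
    return 0;
}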

Key steps for using P2P

CUDA
// Check whether two devices can use P2P with each other
int can_access_peer;
cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]);

// Enable P2P (one call per direction)
cudaSetDevice(gpuid[i]);
cudaDeviceEnablePeerAccess(gpuid[j], 0);
cudaSetDevice(gpuid[j]);
cudaDeviceEnablePeerAccess(gpuid[i], 0);

// Transfer data between the devices
cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault);

// Disable P2P (note: the argument is the peer device, not the current one)
cudaSetDevice(gpuid[i]);
cudaDeviceDisablePeerAccess(gpuid[j]);
cudaSetDevice(gpuid[j]);
cudaDeviceDisablePeerAccess(gpuid[i]);

// Declarations from cuda_runtime_api.h
extern __host__ cudaError_t CUDARTAPI cudaDeviceCanAccessPeer(int *canAccessPeer, int device, int peerDevice);

extern __host__ cudaError_t CUDARTAPI cudaDeviceEnablePeerAccess(int peerDevice, unsigned int flags);

extern __host__ cudaError_t CUDARTAPI cudaDeviceDisablePeerAccess(int peerDevice);
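
When UVA is not available, cudaMemcpyDefault cannot infer the copy direction; the runtime also offers explicit peer copies. A minimal sketch, assuming g0/g1 and gpuid are set up as in the listing above:

CUDA
// Explicit peer copy: source and destination devices are named directly,
// so no UVA pointer inference is needed
cudaMemcpyPeer(g1, gpuid[1], g0, gpuid[0], buf_size);

// Asynchronous variant on a stream
cudaStream_t stream;
cudaStreamCreate(&stream);
cudaMemcpyPeerAsync(g1, gpuid[1], g0, gpuid[0], buf_size, stream);
cudaStreamSynchronize(stream);
cudaStreamDestroy(stream);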

3. Official API Parameter Meanings

CUDA Runtime API :: CUDA Toolkit Documentation
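
In brief:

  • cudaDeviceCanAccessPeer(&canAccessPeer, device, peerDevice) sets canAccessPeer to 1 if device can directly access memory on peerDevice, and to 0 otherwise.
  • cudaDeviceEnablePeerAccess(peerDevice, flags) grants the current device access to memory on peerDevice; flags is reserved and must be 0. Access is one-directional, which is why the sample enables it from both sides; enabling it twice returns cudaErrorPeerAccessAlreadyEnabled.
  • cudaDeviceDisablePeerAccess(peerDevice) revokes the current device's access to peerDevice; it returns cudaErrorPeerAccessNotEnabled if access had not been enabled.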