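// Build note: a minimal sketch, assuming the CUDA samples layout for
// helper_cuda.h / helper_functions.h and a hypothetical file name simpleP2P.cu:
//   nvcc -I<cuda-samples>/Common simpleP2P.cu -o simpleP2P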
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_cuda.h>
#include <helper_functions.h>
#define MAX_GPU_COUNT 64
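// Each thread doubles one element; src and dst may reside on different GPUs,
// in which case the loads/stores travel over the P2P link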
__global__ void SimpleKernel(float *src, float *dst)
{
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
dst[idx] = src[idx] * 2.0f;
}
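// P2P capability check: on Windows the TCC driver is required,
// on other platforms compute capability >= 2.0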
inline bool IsGPUCapableP2P(cudaDeviceProp *pProp)
{
#ifdef _WIN32
return pProp->tccDriver != 0;
#else
return (bool)(pProp->major >= 2);
#endif
}
int main(int argc, char **argv)
{
printf("\n\tStarting\n", argv[0]);
// Check that we are running on a 64-bit OS with a 64-bit target
if (sizeof(void*) != 8)
{
printf("\n\tError for program only supported with 64-bit OS and 64-bit target\n");
return EXIT_WAIVED;
}
// Find the first two devices with compute capability >= 2.0
int gpu_n;
cudaGetDeviceCount(&gpu_n);
printf("\n\tDevice count: %d\n", gpu_n);
if (gpu_n < 2)
{
printf("\n\tError for two or more GPUs with SM2.0 required\n");
return EXIT_WAIVED;
}
cudaDeviceProp prop[MAX_GPU_COUNT];
int gpuid[MAX_GPU_COUNT], gpu_count = 0;
printf("\n\tShow device\n");// 展示所有设备
for (int i=0; i < gpu_n; i++)
{
cudaGetDeviceProperties(&prop[i], i);
if ((prop[i].major >= 2)
#ifdef _WIN32
&& prop[i].tccDriver// Windows 系统还要求有 Tesla 计算集群驱动
#endif
)
gpuid[gpu_count++] = i;
printf("\n\tGPU%d = \"%15s\" ---- %s\n", i, prop[i].name, (IsGPUCapableP2P(&prop[i]) ? "YES" : "NO"));
}
if (gpu_count < 2)
{
printf("\n\tError for two or more GPUs with SM2.0 required\n");
#ifdef _WIN32
printf("\nOr for TCC driver required\n");
#endif
cudaSetDevice(0);
cudaDeviceReset(); // clean up device state before waiving
return EXIT_WAIVED;
}
// Find a pair of devices to test
int can_access_peer, p2pCapableGPUs[2];
p2pCapableGPUs[0] = p2pCapableGPUs[1] = -1;
printf("\n\tShow combination of devices with P2P\n");// 展示所有能 P2P 的设备组合
for (int i = 0; i < gpu_count - 1; i++)
{
for (int j = i + 1; j < gpu_count; j++)
{
cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]);
if (can_access_peer)
{
printf("\n\tGPU%d (%s) <--> GPU%d (%s) : %s\n", gpuid[i], prop[gpuid[i]].name, gpuid[j], prop[gpuid[j]].name);
if (p2pCapableGPUs[0] == -1)
{
p2pCapableGPUs[0] = gpuid[i];
p2pCapableGPUs[1] = gpuid[j];
}
}
}
}
if (p2pCapableGPUs[0] == -1 || p2pCapableGPUs[1] == -1)
{
printf("\n\tError for P2P not available among GPUs\n");
for (int i=0; i < gpu_count; i++)
cudaSetDevice(gpuid[i]);
return EXIT_WAIVED;
}
// Run the test on the pair of devices found
gpuid[0] = p2pCapableGPUs[0];
gpuid[1] = p2pCapableGPUs[1];
printf("\n\tEnabling P2P between GPU%d and GPU%d\n", gpuid[0], gpuid[1]);
// Enable P2P access
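// Peer access is unidirectional, so it is enabled from each device in turn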
cudaSetDevice(gpuid[0]);
cudaDeviceEnablePeerAccess(gpuid[1], 0);
cudaSetDevice(gpuid[1]);
cudaDeviceEnablePeerAccess(gpuid[0], 0);
// Check that both devices support a Unified Virtual Address (UVA) space
if (!(prop[gpuid[0]].unifiedAddressing && prop[gpuid[1]].unifiedAddressing))
{
printf("\n\tError: both GPUs must support UVA\n");
return EXIT_WAIVED;
}
// Allocate buffers: 16M floats (64 MB) on each GPU, plus a pinned host buffer
const size_t buf_size = 1024 * 1024 * 16 * sizeof(float);
printf("\n\tAllocating buffers %iMB\n", int(buf_size / 1024 / 1024));
cudaSetDevice(gpuid[0]);
float *g0;
cudaMalloc(&g0, buf_size);
cudaSetDevice(gpuid[1]);
float *g1;
cudaMalloc(&g1, buf_size);
float *h0;
cudaMallocHost(&h0, buf_size);
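// Time 100 ping-pong copies between the GPUs with CUDA events;
// blocking-sync events let the host sleep instead of spin while waiting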
cudaEvent_t start_event, stop_event;
int eventflags = cudaEventBlockingSync;
float time_memcpy;
cudaEventCreateWithFlags(&start_event, eventflags);
cudaEventCreateWithFlags(&stop_event, eventflags);
cudaEventRecord(start_event, 0);
for (int i=0; i<100; i++)
{
// Ping-pong copies between the two GPUs
// With UVA, cudaMemcpyDefault infers the copy direction from the pointers themselves (host or device)
if (i % 2 == 0)
cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault);
else
cudaMemcpy(g0, g1, buf_size, cudaMemcpyDefault);
}
cudaEventRecord(stop_event, 0);
cudaEventSynchronize(stop_event);
cudaEventElapsedTime(&time_memcpy, start_event, stop_event);
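// Aggregate bandwidth: 100 copies of buf_size bytes over the elapsed time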
printf("\n\tcudaMemcpy: %.2fGB/s\n", (100.0f * buf_size) / (1024.0f * 1024.0f * 1024.0f * (time_memcpy / 1000.0f)));
for (int i=0; i<buf_size / sizeof(float); i++)
h0[i] = float(i % 4096);
cudaSetDevice(gpuid[0]);
cudaMemcpy(g0, h0, buf_size, cudaMemcpyDefault);
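// Launch configuration: one thread per float element, 512 threads per block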
const dim3 threads(512, 1);
const dim3 blocks((buf_size / sizeof(float)) / threads.x, 1);
// Kernel on GPU1 reads GPU0's global memory over P2P, computes, and writes to GPU1's global memory
printf("\n\tRun kernel on GPU%d, reading data from GPU%d and writing to GPU%d\n", gpuid[1], gpuid[0], gpuid[1]);
cudaSetDevice(gpuid[1]);
SimpleKernel<<<blocks, threads>>>(g0, g1);
cudaDeviceSynchronize();
// Kernel on GPU0 reads GPU1's global memory over P2P, computes, and writes to GPU0's global memory
printf("\n\tRun kernel on GPU%d, reading data from GPU%d and writing to GPU%d\n", gpuid[0], gpuid[1], gpuid[0]);
cudaSetDevice(gpuid[0]);
SimpleKernel<<<blocks, threads>>>(g1, g0);
cudaDeviceSynchronize();
// Verify the results
cudaMemcpy(h0, g0, buf_size, cudaMemcpyDefault);
int error_count = 0;
for (int i=0; i<buf_size / sizeof(float); i++)
{
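// Each kernel doubled the data once, so every element should be its original value times 4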
if (h0[i] != float(i % 4096) * 2.0f * 2.0f)
{
printf("\n\tResult error at %i: gpu[i] = %f, cpu[i] = %f\n", i, h0[i], (float(i%4096)*2.0f*2.0f));
if (error_count++ > 10)
break;
}
}
// Disable P2P access
cudaSetDevice(gpuid[0]);
cudaDeviceDisablePeerAccess(gpuid[1]);
cudaSetDevice(gpuid[1]);
cudaDeviceDisablePeerAccess(gpuid[0]);
// Cleanup
cudaFreeHost(h0);
cudaSetDevice(gpuid[0]);
cudaFree(g0);
cudaSetDevice(gpuid[1]);
cudaFree(g1);
cudaEventDestroy(start_event);
cudaEventDestroy(stop_event);
for (int i=0; i<gpu_n; i++)
cudaSetDevice(i);
printf("\n\t%s!\n",error_count?"Test failed": "Test passed");
getchar();
return 0;
}