用CUDA进行一维数组的矢量求和

基于GPU的一维数组的矢量求和

#include <iostream>

#include <cstdio>


// Number of elements in each vector; the kernel uses it as its bounds guard.
#define N 20

/**
 * Element-wise vector sum on the device: c[i] = a[i] + b[i] for 0 <= i < N.
 *
 * Each thread handles at most one element, chosen by its global 1D index
 * (blockIdx.x * blockDim.x + threadIdx.x). Threads whose index is >= N do
 * nothing, so any 1D launch that provides at least N threads in total covers
 * the whole vector -- including the one-thread-per-block form <<<N, 1>>>,
 * for which the index reduces to blockIdx.x.
 *
 * a, b : input vectors, dereferenced on the device (at least N ints each).
 * c    : output vector, written on the device (at least N ints).
 *
 * Note: omitting __global__ makes the <<<...>>> launch fail to compile with
 * "error: a host function call cannot be configured".
 */
__global__ void add(int *a, int *b, int *c) {
    // Unsigned on purpose: a wrapped-around index must not become negative
    // and slip past the bounds guard below.
    const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < N) {
        c[tid] = a[tid] + b[tid];
    }
}


// Reports a failed CUDA runtime call on stderr; returns true iff `status` is cudaSuccess.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    std::cerr << "CUDA error in " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

/**
 * Fills two host vectors with 0..N-1, adds them on the GPU with the `add`
 * kernel, copies the result back and prints one element per line.
 *
 * Returns 0 on success, 1 if any CUDA call or the kernel launch fails
 * (in which case nothing is printed to stdout).
 */
int main() {
    const size_t bytes = N * sizeof(int);

    int a[N];
    int b[N];
    int c[N];
    for (int i = 0; i < N; i++) {
        a[i] = i;
        b[i] = i;
    }

    // Start as null so the unconditional cudaFree calls below are safe even
    // when an allocation fails part-way through.
    int *dev_a = NULL;
    int *dev_b = NULL;
    int *dev_c = NULL;

    // The explicit (void**)&ptr cast seen in C-style examples is not needed here;
    // the original code already called cudaMalloc(&ptr, size) directly.
    bool ok = cudaOk(cudaMalloc(&dev_a, bytes), "cudaMalloc(dev_a)")
           && cudaOk(cudaMalloc(&dev_b, bytes), "cudaMalloc(dev_b)")
           && cudaOk(cudaMalloc(&dev_c, bytes), "cudaMalloc(dev_c)")
           && cudaOk(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(a -> dev_a)")
           && cudaOk(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(b -> dev_b)");

    if (ok) {
        // N blocks of one thread each: one block per vector element.
        add<<<N, 1>>>(dev_a, dev_b, dev_c);

        // A launch does not return a status itself; configuration errors are
        // picked up via cudaGetLastError(). The device-to-host copy that
        // follows is checked too, so a failure there is not silently ignored.
        ok = cudaOk(cudaGetLastError(), "add kernel launch")
          && cudaOk(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(dev_c -> c)");
    }

    // Single cleanup path for success and failure alike.
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    if (!ok) {
        return 1;
    }

    for (int i = 0; i < N; i++) {
        // Print from the host copy c[]: dev_c is a device pointer, and
        // dereferencing dev_c[i] on the host results in a segmentation fault.
        std::cout << c[i] << std::endl;
    }
    return 0;
}