/*
 * Element-wise vector addition: c[i] = a[i] + b[i] for 0 <= i < n.
 *
 * Launch layout: 1D grid of 1D blocks, one thread per element. Threads whose
 * global index falls at or beyond n do nothing, so the grid may be larger
 * than n. Uses no shared memory.
 *
 * a, b : input arrays, read-only, at least n elements each.
 * c    : output array, at least n elements.
 */
__global__ void vecadd_kernel(int n, const float *a, const float *b, float *c)
{
    const int i = blockDim.x * blockIdx.x + threadIdx.x;

    // Tail guard: the last block is usually only partially filled.
    if (i < n) {
        c[i] = a[i] + b[i];
    }
}

/*
 * Host-side launcher for vecadd_kernel (C linkage).
 *
 * n       : number of elements to add; n <= 0 is a no-op.
 * a, b, c : forwarded unchanged to the kernel, so they must be pointers the
 *           device can dereference.
 *
 * The kernel is launched on the default stream and this function does not
 * synchronize or check for errors; callers that need completion or error
 * status must synchronize / query cudaGetLastError() themselves.
 */
extern "C" void vecadd(int n, float *a, float *b, float *c)
{
    // A zero-sized grid is an invalid launch configuration; nothing to do anyway.
    if (n <= 0) {
        return;
    }

    const dim3 blockdim(128, 1, 1);

    // Ceiling division so the trailing (n % blockdim.x) elements are covered.
    // Done in unsigned arithmetic so n + blockdim.x - 1 cannot overflow int.
    const unsigned int nblocks =
        (static_cast<unsigned int>(n) + blockdim.x - 1u) / blockdim.x;
    const dim3 griddim(nblocks, 1, 1);

    vecadd_kernel<<<griddim, blockdim>>>(n, a, b, c);
}