#define N 10 __global__ void VecAdd( int *a, int *b, int *c ) { int tid = blockIdx.x; // this thread handles the data at its thread id if (tid < N) c[tid] = a[tid] + b[tid]; } int main( void ) { int a[N], b[N], c[N]; int *dev_a, *dev_b, *dev_c; // allocate the memory on the GPU cudaMalloc( (void**)&dev_a, N * sizeof(int) ); cudaMalloc( (void**)&dev_b, N * sizeof(int) ); cudaMalloc( (void**)&dev_c, N * sizeof(int) ); // fill the arrays 'a' and 'b' on the CPU for (int i=0; i>>( dev_a, dev_b, dev_c ); // copy the array 'c' back from the GPU to the CPU cudaMemcpy( c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost ); // display the results for (int i=0; i