// Vector-addition micro-benchmark: adds two MAX-element int arrays once on the
// GPU and once on the CPU, and prints the wall-clock time of each variant.
//
// NOTE(review): the header names of the three original #include directives
// were missing from the source text. The set below is reconstructed from the
// symbols the program uses (printf, the CUDA runtime API, the cut*Timer
// helpers from the legacy CUDA SDK "cutil" library) -- confirm against the
// build setup.
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil.h>

#define MAX 1048576
#define N 1024

// Abort with a readable message when a CUDA runtime call fails, instead of
// silently continuing with a sticky error.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(err_));                               \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

// Host-side input and output buffers (static storage; too large for the stack).
int mat_1[MAX];
int mat_2[MAX];
int result[MAX];

// Element-wise addition: C[x] = A[x] + B[x] for x in [0, MAX).
//
// Launch layout: 1D grid of 1D blocks, one thread per element. Threads whose
// global index is >= MAX do nothing, so the grid may overshoot MAX.
// A, B and C must be device pointers to at least MAX ints each.
__global__ void kernel(int* A, int* B, int* C)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;

    if (x < MAX)
    {
        C[x] = A[x] + B[x];
    }
}

// Runs the GPU and CPU variants of the addition, printing the elapsed time of
// each. Returns 0 on success; exits with EXIT_FAILURE on any CUDA error and
// returns 1 if the GPU result is wrong.
int main(void)
{
    const size_t bytes = MAX * sizeof(int);
    const unsigned int blockSize = 512;
    // Ceiling division so a trailing partial block is still launched when MAX
    // is not a multiple of blockSize.
    const unsigned int nBlocks = (MAX + blockSize - 1) / blockSize;

    unsigned int hTimer;
    int *A = NULL, *B = NULL, *C = NULL;
    int i = 0;
    int mismatches = 0;

    // Inputs are chosen so that every element of the sum equals MAX.
    for (i = 0; i < MAX; i++)
    {
        mat_1[i] = i;
        mat_2[i] = MAX - i;
    }

    // One timer is created here and reused (reset) for both measurements.
    cutCreateTimer(&hTimer);

    // ---- GPU measurement: allocation + transfers + kernel ----
    // cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize.
    CUDA_CHECK(cudaDeviceSynchronize());
    cutResetTimer(hTimer);
    cutStartTimer(hTimer);

    CUDA_CHECK(cudaMalloc((void **) &A, bytes));
    CUDA_CHECK(cudaMalloc((void **) &B, bytes));
    CUDA_CHECK(cudaMalloc((void **) &C, bytes));

    CUDA_CHECK(cudaMemcpy(A, mat_1, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(B, mat_2, bytes, cudaMemcpyHostToDevice));

    kernel<<<nBlocks, blockSize>>>(A, B, C);
    // A launch does not return an error code; query it explicitly.
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaMemcpy(result, C, bytes, cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaDeviceSynchronize());
    cutStopTimer(hTimer);
    printf("GPU time: %f msecs.\n", cutGetTimerValue(hTimer));

    // Release device memory (outside the timed region).
    CUDA_CHECK(cudaFree(A));
    CUDA_CHECK(cudaFree(B));
    CUDA_CHECK(cudaFree(C));

    // Check the GPU output before the CPU loop overwrites `result`.
    for (i = 0; i < MAX; i++)
    {
        if (result[i] != mat_1[i] + mat_2[i])
        {
            mismatches++;
        }
    }
    if (mismatches != 0)
    {
        fprintf(stderr, "GPU result mismatch in %d of %d elements.\n",
                mismatches, MAX);
        cutDeleteTimer(hTimer);
        return 1;
    }

    // ---- CPU measurement: the same addition on the host ----
    cutResetTimer(hTimer);
    cutStartTimer(hTimer);

    for (i = 0; i < MAX; i++)
    {
        result[i] = mat_1[i] + mat_2[i];
    }

    cutStopTimer(hTimer);
    printf("CPU time: %f msecs.\n", cutGetTimerValue(hTimer));

    cutDeleteTimer(hTimer);
    return 0;
}