#include #include #include #include #include #include #include "timer.h" int main( int argc, char* argv[] ) { int n; /* size of the vector */ float *restrict a; /* the vector */ float *restrict r; /* the results */ float *restrict e; /* expected results */ float s, c; struct timeval t1, t2, t3; double cgpu, chost; int i; if( argc > 1 ) n = atoi( argv[1] ); else n = 100000; if( n <= 0 ) n = 100000; a = (float*)malloc(n*sizeof(float)); r = (float*)malloc(n*sizeof(float)); e = (float*)malloc(n*sizeof(float)); for( i = 0; i < n; ++i ) a[i] = (float)(i+1) * 2.0f; /*acc_init( acc_device_nvidia );*/ StartTimer(); #pragma data copyin(a[0:n]), copyout(r) { #pragma acc kernels #pragma acc loop gang, vector(128) for( i = 0; i < n; ++i ){ s = sinf(a[i]); c = cosf(a[i]); r[i] = s*s + c*c; } } cgpu = GetTimer(); StartTimer(); for( i = 0; i < n; ++i ){ s = sinf(a[i]); c = cosf(a[i]); e[i] = s*s + c*c; } chost = GetTimer(); /* check the results */ for( i = 0; i < n; ++i ) assert( fabsf(r[i] - e[i]) < 0.000001f ); printf( "%13d iterations completed\n", n ); printf( "%13g microseconds on GPU\n", cgpu*1000 ); printf( "%13g microseconds on host\n", chost*1000 ); return 0; }