// Defines WINDOWS_LEAN_AND_MEAN for the translation unit.
// NOTE(review): the Windows SDK headers key off WIN32_LEAN_AND_MEAN; this spelling
// presumably has no effect on <windows.h> - confirm before renaming, since a rename
// changes which headers get pulled in.
16# define WINDOWS_LEAN_AND_MEAN
// Call-site wrappers: each macro forwards its argument(s) to the helper with the
// matching double-underscore name and appends __FILE__ and __LINE__ of the place
// where the macro is used, so diagnostics can point at the caller.
// cutilExit is the exception: it forwards only argc and argv.
32#define cutilSafeCallNoSync(err) __cudaSafeCallNoSync(err, __FILE__, __LINE__)
33#define cutilSafeCall(err) __cudaSafeCall (err, __FILE__, __LINE__)
34#define cutilSafeThreadSync() __cudaSafeThreadSync(__FILE__, __LINE__)
35#define cufftSafeCall(err) __cufftSafeCall (err, __FILE__, __LINE__)
36#define cutilCheckError(err) __cutilCheckError (err, __FILE__, __LINE__)
// cutilCheckMsg / cutilCheckMsgAndSync map to the __cutilGetLastError* helpers.
37#define cutilCheckMsg(msg) __cutilGetLastError (msg, __FILE__, __LINE__)
38#define cutilCheckMsgAndSync(msg) __cutilGetLastErrorAndSync (msg, __FILE__, __LINE__)
// The allocation expression is wrapped in parentheses before being passed on.
39#define cutilSafeMalloc(mallocCall) __cutilSafeMalloc ((mallocCall), __FILE__, __LINE__)
40#define cutilCondition(val) __cutilCondition (val, __FILE__, __LINE__)
41#define cutilExit(argc, argv) __cutilExit (argc, argv)
// Thin wrapper: calls cudaDeviceSynchronize() and returns its status unchanged.
43inline cudaError cutilDeviceSynchronize()
45 return cudaDeviceSynchronize();
// Thin wrapper: calls cudaDeviceReset() and returns its status unchanged.
48inline cudaError cutilDeviceReset()
50 return cudaDeviceReset();
// Hands val, file and line to cutCheckCondition and takes action when that
// call reports CUTFalse.
// NOTE(review): file is a non-const char*, yet the cutilCondition macro passes
// __FILE__ (a string literal) - consider const char* if cutCheckCondition allows it.
53inline void __cutilCondition(
int val,
char *file,
int line)
55 if( CUTFalse == cutCheckCondition( val, file, line ) ) {
// Exit helper: when cutCheckCmdLineFlag does not find "noprompt" among the
// command-line arguments, prints a "Press ENTER to exit..." prompt to stdout.
//   argc, argv - the program's command line, forwarded to cutCheckCmdLineFlag.
60inline void __cutilExit(
int argc,
char **argv)
62 if (!cutCheckCmdLineFlag(argc, (
const char**)argv,
"noprompt")) {
63 printf(
"\nPress ENTER to exit...\n");
// MIN / MAX: smaller / larger of two values.
// Every use of an argument is parenthesized so that operands containing
// lower-precedence operators (e.g. MIN(x & mask, y)) expand as intended.
// Each argument may be evaluated twice, so avoid arguments with side effects.
#define MIN(a,b) (((a) < (b)) ? (a) : (b))
#define MAX(a,b) (((a) > (b)) ? (a) : (b))
// Maps a compute-capability version (major.minor) to a core count using the
// nGpuArchCoresPerSM lookup table.
//   major, minor - compute-capability version to look up.
// Returns the Cores field of the matching table entry. When no entry matches,
// a "MapSMtoCores undefined SMversion" message is printed to stdout.
75inline int _ConvertSMVer2Cores(
int major,
int minor)
83 sSMtoCores nGpuArchCoresPerSM[] =
// Scan until the sentinel entry whose SM field is -1.
94 while (nGpuArchCoresPerSM[index].SM != -1) {
// The table key packs the version as (major << 4) + minor.
95 if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) {
96 return nGpuArchCoresPerSM[index].Cores;
100 printf(
"MapSMtoCores undefined SMversion %d.%d!\n", major, minor);
// Selects the CUDA device with the largest estimated compute throughput and
// returns its ordinal (max_perf_device, initially 0).
// The estimate is multiProcessorCount * cores-per-multiprocessor * clock_rate.
// NOTE(review): the status codes of cudaGetDeviceCount, cudaGetDeviceProperties
// and cudaDeviceGetAttribute are not inspected in this function.
106inline int cutGetMaxGflopsDeviceId()
108 int current_device = 0;
109 int max_compute_perf = 0;
110 int max_perf_device = 0;
111 int device_count = 0;
112 int best_SM_arch = 0;
115 cudaGetDeviceCount( &device_count );
// Pass 1: record the highest major compute-capability version among devices
// whose major version lies strictly between 0 and 9999.
117 while ( current_device < device_count ) {
118 cudaDeviceProp deviceProp;
119 cudaGetDeviceProperties( &deviceProp, current_device );
120 if (deviceProp.major > 0 && deviceProp.major < 9999) {
121 best_SM_arch = MAX(best_SM_arch, deviceProp.major);
// Pass 2: score every device and remember the best one.
128 while( current_device < device_count ) {
129 cudaDeviceProp deviceProp;
130 cudaGetDeviceProperties( &deviceProp, current_device );
// A device reporting version 9999.9999 is counted as 1 core per multiprocessor
// (presumably the emulation placeholder - confirm); otherwise the table is used.
131 int sm_per_multiproc = (deviceProp.major == 9999 && deviceProp.minor == 9999) ? 1 : _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
133 cudaDeviceGetAttribute(&clock_rate, cudaDevAttrClockRate, current_device);
// NOTE(review): this is an int product; with clock rates reported in kHz it may
// exceed INT_MAX on large devices - verify and consider a wider type.
134 int compute_perf = deviceProp.multiProcessorCount * sm_per_multiproc * clock_rate;
135 if( compute_perf > max_compute_perf ) {
// When some device has major version > 2, only devices of that best major
// version may become the pick.
137 if ( best_SM_arch > 2 ) {
139 if (deviceProp.major == best_SM_arch) {
140 max_compute_perf = compute_perf;
141 max_perf_device = current_device;
// Presumably the fallback branch for best_SM_arch <= 2: any higher score wins.
144 max_compute_perf = compute_perf;
145 max_perf_device = current_device;
150 return max_perf_device;
// Variant of the max-throughput device search that additionally tracks whether
// a device reports tccDriver (bTCC is set to 1 when it does).
// Returns the ordinal stored in max_perf_device (initially 0).
// The score is multiProcessorCount * cores-per-multiprocessor * clock_rate.
// NOTE(review): the status codes of the CUDA runtime calls are not inspected here.
154inline int cutGetMaxGflopsGraphicsDeviceId()
156 int current_device = 0;
157 int max_compute_perf = 0;
158 int max_perf_device = 0;
159 int device_count = 0;
160 int best_SM_arch = 0;
164 cudaGetDeviceCount( &device_count );
// Pass 1: find the highest major compute-capability version (0 < major < 9999).
166 while ( current_device < device_count ) {
167 cudaDeviceProp deviceProp;
168 cudaGetDeviceProperties( &deviceProp, current_device );
170 if (deviceProp.tccDriver) bTCC = 1;
173 if (deviceProp.major > 0 && deviceProp.major < 9999) {
174 best_SM_arch = MAX(best_SM_arch, deviceProp.major);
// Pass 2: score devices and remember the best one.
182 while( current_device < device_count ) {
183 cudaDeviceProp deviceProp;
184 cudaGetDeviceProperties( &deviceProp, current_device );
// Version 9999.9999 is counted as 1 core per multiprocessor; otherwise the
// SM-version table supplies the core count.
185 int sm_per_multiproc = (deviceProp.major == 9999 && deviceProp.minor == 9999) ? 1 : _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
187 if (deviceProp.tccDriver) bTCC = 1;
191 cudaDeviceGetAttribute(&clock_rate, cudaDevAttrClockRate, current_device);
// NOTE(review): int product - may overflow for large devices; verify.
192 int compute_perf = deviceProp.multiProcessorCount * sm_per_multiproc * clock_rate;
193 if( compute_perf > max_compute_perf ) {
// With best_SM_arch > 2, only devices of that major version may be selected.
195 if ( best_SM_arch > 2 ) {
197 if (deviceProp.major == best_SM_arch) {
198 max_compute_perf = compute_perf;
199 max_perf_device = current_device;
// Presumably the fallback branch for best_SM_arch <= 2.
203 max_compute_perf = compute_perf;
204 max_perf_device = current_device;
210 return max_perf_device;
// printf-style helper that formats into a heap buffer and sends the text to
// the debugger via OutputDebugStringA.
//   file - accepted for fprintf-compatible call sites; not referenced in the
//          statements shown here.
//   fmt  - printf-style format string, followed by its arguments.
// NOTE(review): the malloc results are not checked before use in the visible
// statements, and vlist is passed to _vsnprintf on every retry - confirm it is
// re-initialised between attempts.
217 inline void VSPrintf(FILE *file, LPCSTR fmt, ...)
// Initial buffer size in bytes.
219 std::size_t fmt2_sz = 2048;
220 char *fmt2 = (
char*)malloc(fmt2_sz);
222 va_start(vlist, fmt);
// A negative return from _vsnprintf triggers a retry with a newly allocated buffer.
223 while((_vsnprintf(fmt2, fmt2_sz, fmt, vlist)) < 0)
227 fmt2 = (
char*)malloc(fmt2_sz);
229 OutputDebugStringA(fmt2);
// FPRINTF(a) takes ONE argument that is itself a parenthesized argument list,
// e.g. FPRINTF((stderr, "fmt", x)), and pastes it after the chosen function.
// One variant routes to VSPrintf; the remaining variants route to fprintf.
233#define FPRINTF(a) VSPrintf a
235#define FPRINTF(a) fprintf a
240#define FPRINTF(a) fprintf a
243#define FPRINTF(a) fprintf a
// Checks the status of a CUDA runtime call. When err is not cudaSuccess, writes
// the call site (file, line) and cudaGetErrorString(err) to stderr via FPRINTF.
//   err  - status returned by the runtime call.
//   file - source file of the call site (supplied by the cutilSafeCallNoSync macro).
//   line - source line of the call site.
249inline void __cudaSafeCallNoSync( cudaError err,
const char *file,
const int line )
251 if( cudaSuccess != err) {
252 FPRINTF((stderr,
"%s(%i) : cudaSafeCallNoSync() Runtime API error : %s.\n",
253 file, line, cudaGetErrorString( err) ));
// Checks the status of a CUDA runtime call. When err is not cudaSuccess, writes
// the call site (file, line) and cudaGetErrorString(err) to stderr via FPRINTF.
//   err  - status returned by the runtime call.
//   file - source file of the call site (supplied by the cutilSafeCall macro).
//   line - source line of the call site.
258inline void __cudaSafeCall( cudaError err,
const char *file,
const int line )
260 if( cudaSuccess != err) {
261 FPRINTF((stderr,
"%s(%i) : cudaSafeCall() Runtime API error : %s.\n",
262 file, line, cudaGetErrorString( err) ));
// Synchronizes through cutilDeviceSynchronize() and, when the returned status
// is not cudaSuccess, writes the call site and the error string to stderr.
//   file, line - call site, supplied by the cutilSafeThreadSync macro.
267inline void __cudaSafeThreadSync(
const char *file,
const int line )
269 cudaError err = cutilDeviceSynchronize();
270 if ( cudaSuccess != err) {
271 FPRINTF((stderr,
"%s(%i) : cudaDeviceSynchronize() Runtime API error : %s.\n",
272 file, line, cudaGetErrorString( err) ));
// Checks a cuFFT status. When err is not CUFFT_SUCCESS, writes a generic
// "CUFFT error" message to stderr; the format string carries only the call
// site (file, line), not the specific cufftResult value.
277inline void __cufftSafeCall( cufftResult err,
const char *file,
const int line )
279 if( CUFFT_SUCCESS != err) {
280 FPRINTF((stderr,
"%s(%i) : cufftSafeCall() CUFFT error.\n",
// Checks a CUTIL boolean result. Anything other than CUTTrue produces a
// "CUTIL CUDA error" message on stderr, tagged with the call site (file, line).
286inline void __cutilCheckError( CUTBoolean err,
const char *file,
const int line )
288 if( CUTTrue != err) {
289 FPRINTF((stderr,
"%s(%i) : CUTIL CUDA error.\n",
// Fetches the most recent runtime error with cudaGetLastError() (which also
// clears it) and, when it is not cudaSuccess, writes the call site, the
// caller-supplied message and the error string to stderr.
//   errorMessage - context text chosen by the caller.
//   file, line   - call site, supplied by the cutilCheckMsg macro.
295inline void __cutilGetLastError(
const char *errorMessage,
const char *file,
const int line )
297 cudaError_t err = cudaGetLastError();
298 if( cudaSuccess != err) {
299 FPRINTF((stderr,
"%s(%i) : cutilCheckMsg() CUTIL CUDA error : %s : %s.\n",
300 file, line, errorMessage, cudaGetErrorString( err) ));
// Two-stage check used by cutilCheckMsgAndSync:
//   1. cudaGetLastError() - reports an error that is already pending.
//   2. cutilDeviceSynchronize() - reports an error returned by the synchronize call.
// Each failure is written to stderr with the call site and errorMessage.
305inline void __cutilGetLastErrorAndSync(
const char *errorMessage,
const char *file,
const int line )
// Stage 1: error already recorded by the runtime.
307 cudaError_t err = cudaGetLastError();
308 if( cudaSuccess != err) {
309 FPRINTF((stderr,
"%s(%i) : cutilCheckMsg() CUTIL CUDA error : %s : %s.\n",
310 file, line, errorMessage, cudaGetErrorString( err) ));
// Stage 2: synchronize and check the status it returns.
314 err = cutilDeviceSynchronize();
315 if( cudaSuccess != err) {
316 FPRINTF((stderr,
"%s(%i) : cutilCheckMsg cudaDeviceSynchronize error: %s : %s.\n",
317 file, line, errorMessage, cudaGetErrorString( err) ));
// Reports a host allocation failure on stderr, tagged with the call site.
//   pointer    - result of the allocation expression passed to cutilSafeMalloc
//                (presumably reported when it is NULL - confirm).
//   file, line - call site, supplied by the cutilSafeMalloc macro.
322inline void __cutilSafeMalloc(
void *pointer,
const char *file,
const int line )
325 FPRINTF((stderr,
"%s(%i) : cutilSafeMalloc host malloc failure\n",
331#if __DEVICE_EMULATION__
// Device-emulation stand-ins for the device selection helpers.
// No device selection is performed here, so both functions ignore the command
// line and report device ordinal 0. They must return a value: both are declared
// int, and flowing off the end of a non-void function is undefined behaviour.
//   ARGC, ARGV - command line; unused here.
// Returns 0.
inline int cutilDeviceInit(
    int ARGC,
    char **ARGV)
{
    (void)ARGC;
    (void)ARGV;
    return 0;
}

inline int cutilChooseCudaDevice(
    int ARGC,
    char **ARGV)
{
    (void)ARGC;
    (void)ARGV;
    return 0;
}
// Initialises the CUDA device requested on the command line.
//   ARGC, ARGV - command line; the integer value of the "device" argument is
//                read into dev via cutGetCmdLineArgumenti.
// Steps: query the device count, validate dev, query its properties, then make
// it current with cudaSetDevice. Progress goes to stdout, problems to stderr.
335 inline int cutilDeviceInit(
int ARGC,
char **ARGV)
338 cutilSafeCallNoSync(cudaGetDeviceCount(&deviceCount));
339 if (deviceCount == 0) {
340 FPRINTF((stderr,
"CUTIL CUDA error: no devices supporting CUDA.\n"));
344 cutGetCmdLineArgumenti(ARGC, (
const char **) ARGV,
"device", &dev);
// Reject ordinals beyond the last device.
// NOTE(review): only the upper bound is checked in the statements shown here -
// confirm negative values are handled.
347 if (dev > deviceCount-1) {
348 fprintf(stderr,
"\n");
349 fprintf(stderr,
">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
350 fprintf(stderr,
">> cutilDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev);
351 fprintf(stderr,
"\n");
354 cudaDeviceProp deviceProp;
355 cutilSafeCallNoSync(cudaGetDeviceProperties(&deviceProp, dev));
// A major compute-capability version below 1 is treated as "no CUDA support".
356 if (deviceProp.major < 1) {
357 FPRINTF((stderr,
"cutil error: GPU device does not support CUDA.\n"));
360 printf(
"> Using CUDA device [%d]: %s\n", dev, deviceProp.name);
361 cutilSafeCall(cudaSetDevice(dev));
// Chooses the CUDA device for the program.
//   argc, argv - command line.
// When the "device" flag is present, the choice is delegated to
// cutilDeviceInit; otherwise the device returned by cutGetMaxGflopsDeviceId()
// is made current and its name is printed.
367 inline int cutilChooseCudaDevice(
int argc,
char **argv)
369 cudaDeviceProp deviceProp;
372 if( cutCheckCmdLineFlag(argc, (
const char**)argv,
"device") ) {
373 devID = cutilDeviceInit(argc, argv);
// Presumably reached only when cutilDeviceInit reported a failure - confirm.
375 printf(
"exiting...\n");
376 cutilExit(argc, argv);
// No explicit request: pick the device with the highest estimated throughput.
381 devID = cutGetMaxGflopsDeviceId();
382 cutilSafeCallNoSync( cudaSetDevice( devID ) );
383 cutilSafeCallNoSync( cudaGetDeviceProperties(&deviceProp, devID) );
384 printf(
"> Using CUDA device [%d]: %s\n", devID, deviceProp.name);
// Error check in two stages: first the pending error from cudaGetLastError(),
// then the status of cutilDeviceSynchronize(). Either failure is written to
// stderr with the call site, errorMessage and the CUDA error string.
//   errorMessage - context text chosen by the caller.
//   file, line   - call site to report.
392inline void cutilCudaCheckCtxLost(
const char *errorMessage,
const char *file,
const int line )
394 cudaError_t err = cudaGetLastError();
395 if( cudaSuccess != err) {
396 FPRINTF((stderr,
"%s(%i) : CUDA error: %s : %s.\n",
397 file, line, errorMessage, cudaGetErrorString( err) ));
// Second stage: synchronize and check the status it returns.
400 err = cutilDeviceSynchronize();
401 if( cudaSuccess != err) {
402 FPRINTF((stderr,
"%s(%i) : CUDA error: %s : %s.\n",
403 file, line, errorMessage, cudaGetErrorString( err) ));
// Portable names for case-insensitive string comparison: STRCASECMP maps to
// _stricmp or strcasecmp, STRNCASECMP to _strnicmp or strncasecmp
// (presumably selected per platform by surrounding conditionals - confirm).
410#define STRCASECMP _stricmp
412#define STRCASECMP strcasecmp
418#define STRNCASECMP _strnicmp
420#define STRNCASECMP strncasecmp
// Prints the final test verdict to stdout.
//   argc, argv - command line; scanned for "-qatest" / "-noprompt".
//   bStatus    - test outcome, used as an index into sStatus
//                (false -> "FAILED", true -> "PASSED").
// Two output formats are visible: a "&&&& <status> <argv[0]> <args...>" line
// and a "[<argv[0]>] test result" block.
424inline void __cutilQAFinish(
int argc,
char **argv,
bool bStatus)
// "WAIVED" sits at index 2, which a bool index (0 or 1) does not select.
426 const char *sStatus[] = {
"FAILED",
"PASSED",
"WAIVED", NULL };
// Case-insensitive search of argv[1..argc-1] for the two flags.
429 for (
int i=1; i < argc; i++) {
430 if (!STRCASECMP(argv[i],
"-qatest") || !STRCASECMP(argv[i],
"-noprompt")) {
436 printf(
"&&&& %s %s", sStatus[bStatus], argv[0]);
// Echo the remaining command-line arguments after the program name.
437 for (
int i=1; i < argc; i++) printf(
" %s", argv[i]);
440 printf(
"[%s] test result\n%s\n", argv[0], sStatus[bStatus]);
// Checks whether the current CUDA device meets a minimum compute capability.
//   major_version, minor_version - required compute capability.
//   argc, argv                   - command line, forwarded to __cutilQAFinish.
// The device passes when its major version is greater than major_version, or
// equal to it with a minor version of at least minor_version.
445inline bool cutilCudaCapabilities(
int major_version,
int minor_version,
int argc,
char **argv)
447 cudaDeviceProp deviceProp;
// Start from 0.0 so the fields are defined before the properties are queried.
448 deviceProp.major = 0;
449 deviceProp.minor = 0;
452#ifdef __DEVICE_EMULATION__
453 printf(
"> Compute Device Emulation Mode \n");
// Query the device that is current for this host thread.
456 cutilSafeCall( cudaGetDevice(&dev) );
457 cutilSafeCall( cudaGetDeviceProperties(&deviceProp, dev));
459 if((deviceProp.major > major_version) ||
460 (deviceProp.major == major_version && deviceProp.minor >= minor_version)) {
461 printf(
"> Device %d: <%16s >, Compute SM %d.%d detected\n", dev, deviceProp.name, deviceProp.major, deviceProp.minor);
// Requirement not met: report it, then finish through __cutilQAFinish with
// status true (which selects "PASSED" in that helper's status table).
465 printf(
"There is no device supporting CUDA compute capability %d.%d.\n", major_version, minor_version);
466 __cutilQAFinish(argc, argv,
true);