/* * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. * * NOTICE TO USER: * * This source code is subject to NVIDIA ownership rights under U.S. and * international Copyright laws. Users and possessors of this source code * are hereby granted a nonexclusive, royalty-free license to use this code * in individual and commercial software. * * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE * OR PERFORMANCE OF THIS SOURCE CODE. * * U.S. Government End Users. This source code is a "commercial item" as * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of * "commercial computer software" and "commercial computer software * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) * and is provided to the U.S. Government only as a commercial end item. * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the * source code with only those rights set forth herein. * * Any use of this source code in individual and commercial software must * include, in the user documentation and internal comments to the code, * the above Disclaimer and U.S. Government End Users Notice. */ /* This example demonstrates how to use the CUBLAS library * by multiplying two square single-precision matrices (SGEMM) on the device * and comparing the result to the same operation performed * on the host. 
*/ /* Includes, system */ #include #include #include /* Includes, cuda */ #include #include //#include /* Matrix size */ #define N (275) /* Host implementation of a simple version of sgemm */ static void simple_sgemm(int n, float alpha, const float *A, const float *B, float beta, float *C) { int i; int j; int k; for (i = 0; i < n; ++i) { for (j = 0; j < n; ++j) { float prod = 0; for (k = 0; k < n; ++k) { prod += A[k * n + i] * B[j * n + k]; } C[j * n + i] = alpha * prod + beta * C[j * n + i]; } } } /* Main */ int main(int argc, char** argv) { cublasStatus_t status; float* h_A; float* h_B; float* h_C; float* h_C_ref; float* d_A = 0; float* d_B = 0; float* d_C = 0; float alpha = 1.0f; float beta = 0.0f; int n2 = N * N; int i; cublasHandle_t handle; /* Initialize CUBLAS */ printf("simpleCUBLAS test running..\n"); cublasCreate(&handle); /* Allocate host memory for the matrices */ h_A = (float*)malloc(n2 * sizeof(h_A[0])); h_B = (float*)malloc(n2 * sizeof(h_B[0])); h_C = (float*)malloc(n2 * sizeof(h_C[0])); /* Fill the matrices with test data */ for (i = 0; i < n2; i++) { h_A[i] = rand() / (float)RAND_MAX; h_B[i] = rand() / (float)RAND_MAX; h_C[i] = rand() / (float)RAND_MAX; } /* Allocate device memory for the matrices */ cudaMalloc((void**)&d_A, n2 * sizeof(d_A[0])); cudaMalloc((void**)&d_B, n2 * sizeof(d_B[0])); cudaMalloc((void**)&d_C, n2 * sizeof(d_C[0])); /* Initialize the device matrices with the host matrices */ cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1); cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1); cublasSetVector(n2, sizeof(h_C[0]), h_C, 1, d_C, 1); /* Performs operation using plain C code */ simple_sgemm(N, alpha, h_A, h_B, beta, h_C); h_C_ref = h_C; /* Performs operation using cublas */ cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A, N, d_B, N, &beta, d_C, N); /* Allocate host memory for reading back the result from device memory */ h_C = (float*)malloc(n2 * sizeof(h_C[0])); /* Read the result back */ cublasGetVector(n2, 
sizeof(h_C[0]), d_C, 1, h_C, 1); /* Check result against reference */ for (i = 0; i < 10; ++i) { printf("%1.20f %1.20f\n", h_C_ref[i], h_C[i]); } /* Memory clean up */ free(h_A); free(h_B); free(h_C); free(h_C_ref); cudaFree(d_A); cudaFree(d_B); cudaFree(d_C); /* Shutdown */ cublasDestroy(handle); return EXIT_SUCCESS; }