GPUs are usually associated with floating-point math, but that doesn't interest me much; what I really want is integer arithmetic.
So how fast is integer math on a GPU? I threw together some simple add, multiply, and divide kernels to find out.
__global__ void int_add(int *data, int width, int height, int nRep) {
    unsigned int xIndex = blockIdx.x * BLOCK_DIM + threadIdx.x;
    unsigned int yIndex = blockIdx.y * BLOCK_DIM + threadIdx.y;
    if( (xIndex < width) && (yIndex < height)) {
        unsigned int index = yIndex * width + xIndex;
        int x = data[index];
        int result = 0;
        for(int i = 0; i < nRep; i++) {
            result += (x + i);
        }
        data[index] = result;
    }
}
__global__ void int_mul(int *data, int width, int height, int nRep) {
    // ... (same index computation, bounds check, and load of x as in int_add)
        int result = 0;
        for(int i = 0; i < nRep; i++) {
            if(i & 1)
                result += x * i;
            else
                result -= x * i;
        }
        data[index] = result;
    }
}
__global__ void int_div(int *data, int width, int height, int nRep) {
    // ... (same xIndex/yIndex computation and bounds check as in int_add)
        unsigned int index = yIndex * width + xIndex;
        int x = data[index];
        int result = 0;
        for(int i = 1; i <= nRep; i++) {
            result += x / i;
        }
        data[index] = result;
    }
}
I compared these against the CPU. The array size is 256*256, 512*512, or 1024*1024, and each element is iterated over nRep = 256 times.
            add(CPU)  add(GPU)  mul(CPU)  mul(GPU)  div(CPU)  div(GPU)
 256* 256    10.95ms   22.47ms   26.61ms   22.60ms   53.20ms   24.41ms
 512* 512    44.01ms   24.52ms  106.47ms   25.91ms  212.85ms   33.92ms
1024*1024   175.16ms   30.76ms  427.99ms   36.07ms  853.53ms   68.44ms
Taking the fixed overhead into account, the GPU/CPU speed ratios come out to roughly 20x (add), 30x (mul), and 18x (div).
Integer arithmetic is still slow, as expected. If the CPU had 8 cores, the lead would shrink to only about 3x. That said, unlike CPUs, GPUs keep getting faster quickly; the latest models are reportedly already about twice as fast, and the cost-performance is in a completely different league.
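For reference, here is how the "overhead taken into account" figures can be reproduced from the table. The sketch below treats the 256*256 GPU times as essentially pure fixed overhead (transfers plus kernel launch) and subtracts them from the 1024*1024 GPU times; that constant-overhead assumption is mine, not a separate measurement, and the file name is just for illustration.

// rough_speedup.cpp - back-of-the-envelope overhead correction
// (assumption: the 256*256 GPU time is almost entirely fixed transfer/launch overhead)
#include <stdio.h>

int main() {
    const char *op[]  = { "add", "mul", "div" };
    double cpu1024[]  = { 175.16, 427.99, 853.53 };  // CPU time, 1024*1024 (ms)
    double gpu1024[]  = {  30.76,  36.07,  68.44 };  // GPU time, 1024*1024 (ms)
    double overhead[] = {  22.47,  22.60,  24.41 };  // GPU time, 256*256 (ms), taken as overhead
    for (int i = 0; i < 3; i++) {
        double kernelOnly = gpu1024[i] - overhead[i];            // estimated compute-only time
        printf("%s: ~%.0fx\n", op[i], cpu1024[i] / kernelOnly);  // GPU speedup over CPU
    }
    return 0;
}

This prints roughly 21x, 32x, and 19x, in line with the 20/30/18 figures above.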
// int_add.cpp
#include <windows.h>
#include <stdio.h>
#include <stdlib.h>

int main() {
    const unsigned int size_x = 1024;
    const unsigned int size_y = 1024;
    const unsigned int nData = size_x * size_y;
    const unsigned int mem_size = sizeof(int) * nData;
    const int nRep = 256;

    int *h_data = (int *)malloc(nData * sizeof(int));
    for(int i = 0; i < nData; i++) {
        h_data[i] = i;
    }

    LARGE_INTEGER nFreq, nStart, nEnd;
    QueryPerformanceFrequency(&nFreq);
    QueryPerformanceCounter(&nStart);
    // same per-element loop as the GPU kernel, run on a single CPU core
    for(int i = 0; i < nData; i++) {
        int x = h_data[i];
        int result = 0;
        for(int j = 0; j < nRep; j++) {
            result += (x + j);
        }
        h_data[i] = result;
    }
    QueryPerformanceCounter(&nEnd);

    // crude checksum for comparison with the GPU result
    int sumCPU = 0;
    for(int i = 0; i < nData; i++) {
        sumCPU += h_data[i] >> 20;
    }
    printf("time: %0.3f ms\n\n",
           (nEnd.QuadPart - nStart.QuadPart) * 1e3 / nFreq.QuadPart);
    printf("CPU : %d\n", sumCPU);

    free(h_data);
}
// int_add.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil_inline.h>

#define BLOCK_DIM 16

__global__ void int_add(int *data, int width, int height, int nRep) {
    unsigned int xIndex = blockIdx.x * BLOCK_DIM + threadIdx.x;
    unsigned int yIndex = blockIdx.y * BLOCK_DIM + threadIdx.y;
    if( (xIndex < width) && (yIndex < height)) {
        unsigned int index = yIndex * width + xIndex;
        int x = data[index];
        int result = 0;
        for(int i = 0; i < nRep; i++) {
            result += (x + i);
        }
        data[index] = result;
    }
}

void runTest( int argc, char** argv);

int main( int argc, char** argv) {
    runTest( argc, argv);
    cutilExit(argc, argv);
}

void runTest( int argc, char** argv) {
    const unsigned int size_x = 256;
    const unsigned int size_y = 256;
    const unsigned int nData = size_x * size_y;
    const unsigned int mem_size = sizeof(int) * nData;
    const int nRep = 256;

    unsigned int timer;
    cutCreateTimer(&timer);
    if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
        cutilDeviceInit(argc, argv);
    else
        cudaSetDevice( cutGetMaxGflopsDeviceId() );

    int *h_data = (int *)malloc(nData * sizeof(int));
    for(unsigned int i = 0; i < nData; i++) {
        h_data[i] = i;
    }

    // the timer is started before the transfers, so the GPU time below
    // includes host<->device copies as well as the kernel itself
    cutStartTimer(timer);

    // copy the data to the GPU
    int *d_data;
    cutilSafeCall(cudaMalloc( (void**)&d_data, mem_size));
    cutilSafeCall(cudaMemcpy(d_data, h_data, mem_size, cudaMemcpyHostToDevice));

    // setup execution parameters
    dim3 grid(size_x / BLOCK_DIM, size_y / BLOCK_DIM, 1);
    dim3 threads(BLOCK_DIM, BLOCK_DIM, 1);

    // run the kernel on the GPU
    int_add<<< grid, threads >>>(d_data, size_x, size_y, nRep);
    cudaThreadSynchronize();
    cutilCheckMsg("Kernel execution failed");

    // copy the result back from the GPU
    cutilSafeCall(cudaMemcpy(h_data, d_data, mem_size, cudaMemcpyDeviceToHost));
    float optimizedTime = cutGetTimerValue(timer);
    cutStopTimer(timer);

    // crude checksum for comparison with the CPU result
    int sumGPU = 0;
    for(int i = 0; i < nData; i++) {
        sumGPU += h_data[i] >> 20;
    }
    printf("time: %0.3f ms\n\n", optimizedTime);
    printf("GPU : %d\n", sumGPU);

    // cleanup memory
    free(h_data);
    cutilSafeCall(cudaFree(d_data));
    cutilCheckError( cutDeleteTimer(timer));
    cudaThreadExit();
}
// int_mul.cpp
#include <windows.h>
#include <stdio.h>
#include <stdlib.h>

int main() {
    const unsigned int size_x = 256;
    const unsigned int size_y = 256;
    const unsigned int nData = size_x * size_y;
    const unsigned int mem_size = sizeof(int) * nData;
    const int nRep = 256;

    int *h_data = (int *)malloc(nData * sizeof(int));
    for(int i = 0; i < nData; i++) {
        h_data[i] = i;
    }

    LARGE_INTEGER nFreq, nStart, nEnd;
    QueryPerformanceFrequency(&nFreq);
    QueryPerformanceCounter(&nStart);
    // same per-element loop as the GPU kernel, run on a single CPU core
    for(int i = 0; i < nData; i++) {
        int x = h_data[i];
        int result = 0;
        for(int j = 0; j < nRep; j++) {
            if(j & 1)
                result += x * j;
            else
                result -= x * j;
        }
        h_data[i] = result;
    }
    QueryPerformanceCounter(&nEnd);

    // crude checksum for comparison with the GPU result
    int sumCPU = 0;
    for(int i = 0; i < nData; i++) {
        sumCPU += h_data[i] >> 20;
    }
    printf("time: %0.3f ms\n\n",
           (nEnd.QuadPart - nStart.QuadPart) * 1e3 / nFreq.QuadPart);
    printf("CPU : %d\n", sumCPU);

    free(h_data);
}
// int_mul.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil_inline.h>

#define BLOCK_DIM 16

__global__ void int_mul(int *data, int width, int height, int nRep) {
    unsigned int xIndex = blockIdx.x * BLOCK_DIM + threadIdx.x;
    unsigned int yIndex = blockIdx.y * BLOCK_DIM + threadIdx.y;
    if( (xIndex < width) && (yIndex < height)) {
        unsigned int index = yIndex * width + xIndex;
        int x = data[index];
        int result = 0;
        for(int i = 0; i < nRep; i++) {
            if(i & 1)
                result += x * i;
            else
                result -= x * i;
        }
        data[index] = result;
    }
}

void runTest( int argc, char** argv);

int main( int argc, char** argv) {
    runTest( argc, argv);
    cutilExit(argc, argv);
}

void runTest( int argc, char** argv) {
    const unsigned int size_x = 1024;
    const unsigned int size_y = 1024;
    const unsigned int nData = size_x * size_y;
    const unsigned int mem_size = sizeof(int) * nData;
    const int nRep = 256;

    unsigned int timer;
    cutCreateTimer(&timer);
    if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
        cutilDeviceInit(argc, argv);
    else
        cudaSetDevice( cutGetMaxGflopsDeviceId() );

    int *h_data = (int *)malloc(nData * sizeof(int));
    for(unsigned int i = 0; i < nData; i++) {
        h_data[i] = i;
    }

    // the timer is started before the transfers, so the GPU time below
    // includes host<->device copies as well as the kernel itself
    cutStartTimer(timer);

    // copy the data to the GPU
    int *d_data;
    cutilSafeCall(cudaMalloc( (void**)&d_data, mem_size));
    cutilSafeCall(cudaMemcpy(d_data, h_data, mem_size, cudaMemcpyHostToDevice));

    // setup execution parameters
    dim3 grid(size_x / BLOCK_DIM, size_y / BLOCK_DIM, 1);
    dim3 threads(BLOCK_DIM, BLOCK_DIM, 1);

    // run the kernel on the GPU
    int_mul<<< grid, threads >>>(d_data, size_x, size_y, nRep);
    cudaThreadSynchronize();
    cutilCheckMsg("Kernel execution failed");

    // copy the result back from the GPU
    cutilSafeCall(cudaMemcpy(h_data, d_data, mem_size, cudaMemcpyDeviceToHost));
    float optimizedTime = cutGetTimerValue(timer);
    cutStopTimer(timer);

    // crude checksum for comparison with the CPU result
    int sumGPU = 0;
    for(int i = 0; i < nData; i++) {
        sumGPU += h_data[i] >> 20;
    }
    printf("time: %0.3f ms\n\n", optimizedTime);
    printf("GPU : %d\n", sumGPU);

    // cleanup memory
    free(h_data);
    cutilSafeCall(cudaFree(d_data));
    cutilCheckError( cutDeleteTimer(timer));
    cudaThreadExit();
}
// int_div.cpp
#include <windows.h>
#include <stdio.h>
#include <stdlib.h>

int main() {
    const unsigned int size_x = 1024;
    const unsigned int size_y = 1024;
    const unsigned int nData = size_x * size_y;
    const unsigned int mem_size = sizeof(int) * nData;
    const int nRep = 256;

    int *h_data = (int *)malloc(nData * sizeof(int));
    for(int i = 0; i < nData; i++) {
        h_data[i] = i;
    }

    LARGE_INTEGER nFreq, nStart, nEnd;
    QueryPerformanceFrequency(&nFreq);
    QueryPerformanceCounter(&nStart);
    // same per-element loop as the GPU kernel, run on a single CPU core
    for(int i = 0; i < nData; i++) {
        int x = h_data[i];
        int result = 0;
        for(int j = 1; j <= nRep; j++) {
            result += x / j;
        }
        h_data[i] = result;
    }
    QueryPerformanceCounter(&nEnd);

    // crude checksum for comparison with the GPU result
    int sumCPU = 0;
    for(int i = 0; i < nData; i++) {
        sumCPU += h_data[i] >> 10;
    }
    printf("time: %0.3f ms\n\n",
           (nEnd.QuadPart - nStart.QuadPart) * 1e3 / nFreq.QuadPart);
    printf("CPU : %d\n", sumCPU);

    free(h_data);
}
// int_div.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil_inline.h>

#define BLOCK_DIM 16

__global__ void int_div(int *data, int width, int height, int nRep) {
    unsigned int xIndex = blockIdx.x * BLOCK_DIM + threadIdx.x;
    unsigned int yIndex = blockIdx.y * BLOCK_DIM + threadIdx.y;
    if( (xIndex < width) && (yIndex < height)) {
        unsigned int index = yIndex * width + xIndex;
        int x = data[index];
        int result = 0;
        for(int i = 1; i <= nRep; i++) {
            result += x / i;
        }
        data[index] = result;
    }
}

void runTest( int argc, char** argv);

int main( int argc, char** argv) {
    runTest( argc, argv);
    cutilExit(argc, argv);
}

void runTest( int argc, char** argv) {
    const unsigned int size_x = 256;
    const unsigned int size_y = 256;
    const unsigned int nData = size_x * size_y;
    const unsigned int mem_size = sizeof(int) * nData;
    const int nRep = 256;

    unsigned int timer;
    cutCreateTimer(&timer);
    if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
        cutilDeviceInit(argc, argv);
    else
        cudaSetDevice( cutGetMaxGflopsDeviceId() );

    int *h_data = (int *)malloc(nData * sizeof(int));
    for(unsigned int i = 0; i < nData; i++) {
        h_data[i] = i;
    }

    // the timer is started before the transfers, so the GPU time below
    // includes host<->device copies as well as the kernel itself
    cutStartTimer(timer);

    // copy the data to the GPU
    int *d_data;
    cutilSafeCall(cudaMalloc( (void**)&d_data, mem_size));
    cutilSafeCall(cudaMemcpy(d_data, h_data, mem_size, cudaMemcpyHostToDevice));

    // setup execution parameters
    dim3 grid(size_x / BLOCK_DIM, size_y / BLOCK_DIM, 1);
    dim3 threads(BLOCK_DIM, BLOCK_DIM, 1);

    // run the kernel on the GPU
    int_div<<< grid, threads >>>(d_data, size_x, size_y, nRep);
    cudaThreadSynchronize();
    cutilCheckMsg("Kernel execution failed");

    // copy the result back from the GPU
    cutilSafeCall(cudaMemcpy(h_data, d_data, mem_size, cudaMemcpyDeviceToHost));
    float optimizedTime = cutGetTimerValue(timer);
    cutStopTimer(timer);

    // crude checksum for comparison with the CPU result
    int sumGPU = 0;
    for(int i = 0; i < nData; i++) {
        sumGPU += h_data[i] >> 10;
    }
    printf("time: %0.3f ms\n\n", optimizedTime);
    printf("GPU : %d\n", sumGPU);

    // cleanup memory
    free(h_data);
    cutilSafeCall(cudaFree(d_data));
    cutilCheckError( cutDeleteTimer(timer));
    cudaThreadExit();
}