Integer Arithmetic

When people talk about GPUs it is usually about floating-point computation, but that does not interest me much. What I really want to do is integer arithmetic.

So how fast is integer arithmetic on the GPU? I tried it out with some simple add, multiply, and divide kernels.


__global__ void int_add(int *data, int width, int height, int nRep) {
    unsigned int xIndex = blockIdx.x * BLOCK_DIM + threadIdx.x;
    unsigned int yIndex = blockIdx.y * BLOCK_DIM + threadIdx.y;
    if( (xIndex < width) && (yIndex < height)) {
        unsigned int index = yIndex * width + xIndex;
        int x = data[index];
        int result = 0;
        for(int i = 0; i < nRep; i++) {
            result += (x + i);
        }
        data[index] = result;
    }
}


__global__ void int_mul(int *data, int width, int height, int nRep) {
    // ...
        int result = 0;
        for(int i = 0; i < nRep; i++) {
            if(i & 1)
                result += x * i;
            else
                result -= x * i;
        }
        data[index] = result;
    }
}


__global__ void int_div(int *data, int width, int height, int nRep) {
    // ...
        unsigned int index = yIndex * width + xIndex;
        int x = data[index];
        int result = 0;
        for(int i = 1; i <= nRep; i++) {
            result += x / i;
        }
        data[index] = result;
    }
}

I compared this against the CPU. The array size is 256*256, 512*512, or 1024*1024, and each element repeats the operation nRep = 256 times.


              add(CPU)   add(GPU)   mul(CPU)   mul(GPU)   div(CPU)   div(GPU)
 256* 256      10.95ms    22.47ms    26.61ms    22.60ms    53.20ms    24.41ms
 512* 512      44.01ms    24.52ms   106.47ms    25.91ms   212.85ms    33.92ms
1024*1024     175.16ms    30.76ms   427.99ms    36.07ms   853.53ms    68.44ms

Taking the overhead into account (the GPU times include a fixed cost of roughly 22 ms for the transfers and launch, which is why even the 256*256 case takes about that long), the GPU/CPU ratio at 1024*1024 works out to about 20x for add (175.16 / (30.76 - 22)), 30x for mul (427.99 / (36.07 - 22)), and 18x for div (853.53 / (68.44 - 22)).
Integer arithmetic is still slow, as expected: against an 8-core CPU the advantage would shrink to only about 3x. Unlike CPUs, though, GPUs keep getting faster; the latest models are reportedly already about twice as fast, and the cost-performance is on a completely different level.
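
The GPU times above include the host-device transfers as well as the kernel itself, which is where that fixed overhead comes from. One way to separate the two is to time just the kernel with CUDA events instead of the CUTIL timer. A minimal sketch (reusing d_data, grid, threads, size_x, size_y, and nRep from int_add.cu below) might look like this:

// Sketch only: time the kernel launch by itself with CUDA events,
// so the cudaMemcpy overhead can be measured separately.
cudaEvent_t evStart, evStop;
cudaEventCreate(&evStart);
cudaEventCreate(&evStop);

cudaEventRecord(evStart, 0);
int_add<<< grid, threads >>>(d_data, size_x, size_y, nRep);
cudaEventRecord(evStop, 0);
cudaEventSynchronize(evStop);   // wait until the kernel has finished

float kernelMs = 0.0f;
cudaEventElapsedTime(&kernelMs, evStart, evStop);
printf("kernel only: %0.3f ms\n", kernelMs);

cudaEventDestroy(evStart);
cudaEventDestroy(evStop);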



// int_add.cpp
#include <windows.h>
#include <stdio.h>
#include <stdlib.h>

int main() {
    const unsigned int size_x = 1024;
    const unsigned int size_y = 1024;
    const unsigned int nData = size_x * size_y;
    const unsigned int mem_size = sizeof(int) * nData;
    const int nRep = 256;

    int *h_data = (int *)malloc(nData * sizeof(int));
    for(int i = 0; i < nData; i++) {
        h_data[i] = i;
    }

    // time the loop with the Windows high-resolution counter
    LARGE_INTEGER nFreq, nStart, nEnd;
    QueryPerformanceFrequency(&nFreq);
    QueryPerformanceCounter(&nStart);

    for(int i = 0; i < nData; i++) {
        int x = h_data[i];
        int result = 0;
        for(int j = 0; j < nRep; j++) {
            result += (x + j);
        }
        h_data[i] = result;
    }

    QueryPerformanceCounter(&nEnd);

    // checksum to compare against the GPU result
    int sumCPU = 0;
    for(int i = 0; i < nData; i++) {
        sumCPU += h_data[i] >> 20;
    }

    printf("time: %0.3f ms\n\n",
           (nEnd.QuadPart - nStart.QuadPart) * 1e3 / nFreq.QuadPart);
    printf("CPU : %d\n", sumCPU);

    free(h_data);
}
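
The CPU loop above runs on a single core, which is what the "8 cores" estimate earlier refers to. A hypothetical OpenMP version of the same loop (not what was measured above; build with /openmp on MSVC or -fopenmp on gcc) would simply split the outer loop across cores:

// Hypothetical: the same CPU add loop parallelized with OpenMP.
void cpu_add_omp(int *h_data, int nData, int nRep) {
    #pragma omp parallel for
    for(int i = 0; i < nData; i++) {
        int x = h_data[i];
        int result = 0;
        for(int j = 0; j < nRep; j++) {
            result += (x + j);
        }
        h_data[i] = result;
    }
}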


// int_add.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil_inline.h>

#define BLOCK_DIM 16

__global__ void int_add(int *data, int width, int height, int nRep) {
    unsigned int xIndex = blockIdx.x * BLOCK_DIM + threadIdx.x;
    unsigned int yIndex = blockIdx.y * BLOCK_DIM + threadIdx.y;
    if( (xIndex < width) && (yIndex < height)) {
        unsigned int index = yIndex * width + xIndex;
        int x = data[index];
        int result = 0;
        for(int i = 0; i < nRep; i++) {
            result += (x + i);
        }
        data[index] = result;
    }
}

void runTest( int argc, char** argv);

int main( int argc, char** argv) {
    runTest( argc, argv);
    cutilExit(argc, argv);
}

void runTest( int argc, char** argv) {
    const unsigned int size_x = 256;
    const unsigned int size_y = 256;
    const unsigned int nData = size_x * size_y;
    const unsigned int mem_size = sizeof(int) * nData;
    const int nRep = 256;

    unsigned int timer;
    cutCreateTimer(&timer);

    if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
        cutilDeviceInit(argc, argv);
    else
        cudaSetDevice( cutGetMaxGflopsDeviceId() );

    int *h_data = (int *)malloc(nData * sizeof(int));
    for(unsigned int i = 0; i < nData; i++) {
        h_data[i] = i;
    }

    cutStartTimer(timer);

    // transfer the data to the GPU
    int *d_data;
    cutilSafeCall(cudaMalloc( (void**)&d_data, mem_size));
    cutilSafeCall(cudaMemcpy(d_data, h_data, mem_size, cudaMemcpyHostToDevice));

    // setup execution parameters
    dim3 grid(size_x / BLOCK_DIM, size_y / BLOCK_DIM, 1);
    dim3 threads(BLOCK_DIM, BLOCK_DIM, 1);

    // run the computation on the GPU
    int_add<<< grid, threads >>>(d_data, size_x, size_y, nRep);
    cudaThreadSynchronize();

    cutilCheckMsg("Kernel execution failed");

    // transfer the data back from the GPU
    cutilSafeCall(cudaMemcpy(h_data, d_data, mem_size, cudaMemcpyDeviceToHost));
    float optimizedTime = cutGetTimerValue(timer);
    cutStopTimer(timer);

    // checksum to compare against the CPU result
    int sumGPU = 0;
    for(int i = 0; i < nData; i++) {
        sumGPU += h_data[i] >> 20;
    }
    printf("time: %0.3f ms\n\n", optimizedTime);
    printf("GPU : %d\n", sumGPU);

    // cleanup memory
    free(h_data);
    cutilSafeCall(cudaFree(d_data));
    cutilCheckError( cutDeleteTimer(timer));

    cudaThreadExit();
}
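
A note on building: the .cu programs here use the CUTIL helpers (cutCreateTimer, cutilSafeCall, cutilCheckMsg, and so on) that shipped with the old CUDA SDK samples, so they have to be compiled with nvcc and pointed at the SDK's common include and library directories; the exact paths depend on where the SDK is installed.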


// int_mul.cpp
#include <windows.h>
#include <stdio.h>
#include <stdlib.h>

int main() {
    const unsigned int size_x = 256;
    const unsigned int size_y = 256;
    const unsigned int nData = size_x * size_y;
    const unsigned int mem_size = sizeof(int) * nData;
    const int nRep = 256;

    int *h_data = (int *)malloc(nData * sizeof(int));
    for(int i = 0; i < nData; i++) {
        h_data[i] = i;
    }

    LARGE_INTEGER nFreq, nStart, nEnd;
    QueryPerformanceFrequency(&nFreq);
    QueryPerformanceCounter(&nStart);

    for(int i = 0; i < nData; i++) {
        int x = h_data[i];
        int result = 0;
        for(int j = 0; j < nRep; j++) {
            if(j & 1)
                result += x * j;
            else
                result -= x * j;
        }
        h_data[i] = result;
    }

    QueryPerformanceCounter(&nEnd);

    // checksum to compare against the GPU result
    int sumCPU = 0;
    for(int i = 0; i < nData; i++) {
        sumCPU += h_data[i] >> 20;
    }

    printf("time: %0.3f ms\n\n",
           (nEnd.QuadPart - nStart.QuadPart) * 1e3 / nFreq.QuadPart);
    printf("CPU : %d\n", sumCPU);

    free(h_data);
}


// int_mul.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil_inline.h>

#define BLOCK_DIM 16

__global__ void int_mul(int *data, int width, int height, int nRep) {
    unsigned int xIndex = blockIdx.x * BLOCK_DIM + threadIdx.x;
    unsigned int yIndex = blockIdx.y * BLOCK_DIM + threadIdx.y;
    if( (xIndex < width) && (yIndex < height)) {
        unsigned int index = yIndex * width + xIndex;
        int x = data[index];
        int result = 0;
        for(int i = 0; i < nRep; i++) {
            if(i & 1)
                result += x * i;
            else
                result -= x * i;
        }
        data[index] = result;
    }
}

void runTest( int argc, char** argv);

int main( int argc, char** argv) {
    runTest( argc, argv);
    cutilExit(argc, argv);
}

void runTest( int argc, char** argv) {
    const unsigned int size_x = 1024;
    const unsigned int size_y = 1024;
    const unsigned int nData = size_x * size_y;
    const unsigned int mem_size = sizeof(int) * nData;
    const int nRep = 256;

    unsigned int timer;
    cutCreateTimer(&timer);

    if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
        cutilDeviceInit(argc, argv);
    else
        cudaSetDevice( cutGetMaxGflopsDeviceId() );

    int *h_data = (int *)malloc(nData * sizeof(int));
    for(unsigned int i = 0; i < nData; i++) {
        h_data[i] = i;
    }

    cutStartTimer(timer);

    // transfer the data to the GPU
    int *d_data;
    cutilSafeCall(cudaMalloc( (void**)&d_data, mem_size));
    cutilSafeCall(cudaMemcpy(d_data, h_data, mem_size, cudaMemcpyHostToDevice));

    // setup execution parameters
    dim3 grid(size_x / BLOCK_DIM, size_y / BLOCK_DIM, 1);
    dim3 threads(BLOCK_DIM, BLOCK_DIM, 1);

    // run the computation on the GPU
    int_mul<<< grid, threads >>>(d_data, size_x, size_y, nRep);
    cudaThreadSynchronize();

    cutilCheckMsg("Kernel execution failed");

    // transfer the data back from the GPU
    cutilSafeCall(cudaMemcpy(h_data, d_data, mem_size, cudaMemcpyDeviceToHost));
    float optimizedTime = cutGetTimerValue(timer);
    cutStopTimer(timer);

    // checksum to compare against the CPU result
    int sumGPU = 0;
    for(int i = 0; i < nData; i++) {
        sumGPU += h_data[i] >> 20;
    }
    printf("time: %0.3f ms\n\n", optimizedTime);
    printf("GPU : %d\n", sumGPU);

    // cleanup memory
    free(h_data);
    cutilSafeCall(cudaFree(d_data));
    cutilCheckError( cutDeleteTimer(timer));

    cudaThreadExit();
}


// int_div.cpp
#include <windows.h>
#include <stdio.h>
#include <stdlib.h>

int main() {
    const unsigned int size_x = 1024;
    const unsigned int size_y = 1024;
    const unsigned int nData = size_x * size_y;
    const unsigned int mem_size = sizeof(int) * nData;
    const int nRep = 256;

    int *h_data = (int *)malloc(nData * sizeof(int));
    for(int i = 0; i < nData; i++) {
        h_data[i] = i;
    }

    LARGE_INTEGER nFreq, nStart, nEnd;
    QueryPerformanceFrequency(&nFreq);
    QueryPerformanceCounter(&nStart);

    for(int i = 0; i < nData; i++) {
        int x = h_data[i];
        int result = 0;
        for(int j = 1; j <= nRep; j++) {
            result += x / j;
        }
        h_data[i] = result;
    }

    QueryPerformanceCounter(&nEnd);

    // checksum to compare against the GPU result
    int sumCPU = 0;
    for(int i = 0; i < nData; i++) {
        sumCPU += h_data[i] >> 10;
    }

    printf("time: %0.3f ms\n\n",
           (nEnd.QuadPart - nStart.QuadPart) * 1e3 / nFreq.QuadPart);
    printf("CPU : %d\n", sumCPU);

    free(h_data);
}


// int_div.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil_inline.h>

#define BLOCK_DIM 16

__global__ void int_div(int *data, int width, int height, int nRep) {
    unsigned int xIndex = blockIdx.x * BLOCK_DIM + threadIdx.x;
    unsigned int yIndex = blockIdx.y * BLOCK_DIM + threadIdx.y;
    if( (xIndex < width) && (yIndex < height)) {
        unsigned int index = yIndex * width + xIndex;
        int x = data[index];
        int result = 0;
        for(int i = 1; i <= nRep; i++) {
            result += x / i;
        }
        data[index] = result;
    }
}

void runTest( int argc, char** argv);

int main( int argc, char** argv) {
    runTest( argc, argv);
    cutilExit(argc, argv);
}

void runTest( int argc, char** argv) {
    const unsigned int size_x = 256;
    const unsigned int size_y = 256;
    const unsigned int nData = size_x * size_y;
    const unsigned int mem_size = sizeof(int) * nData;
    const int nRep = 256;

    unsigned int timer;
    cutCreateTimer(&timer);

    if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
        cutilDeviceInit(argc, argv);
    else
        cudaSetDevice( cutGetMaxGflopsDeviceId() );

    int *h_data = (int *)malloc(nData * sizeof(int));
    for(unsigned int i = 0; i < nData; i++) {
        h_data[i] = i;
    }

    cutStartTimer(timer);

    // transfer the data to the GPU
    int *d_data;
    cutilSafeCall(cudaMalloc( (void**)&d_data, mem_size));
    cutilSafeCall(cudaMemcpy(d_data, h_data, mem_size, cudaMemcpyHostToDevice));

    // setup execution parameters
    dim3 grid(size_x / BLOCK_DIM, size_y / BLOCK_DIM, 1);
    dim3 threads(BLOCK_DIM, BLOCK_DIM, 1);

    // run the computation on the GPU
    int_div<<< grid, threads >>>(d_data, size_x, size_y, nRep);
    cudaThreadSynchronize();

    cutilCheckMsg("Kernel execution failed");

    // transfer the data back from the GPU
    cutilSafeCall(cudaMemcpy(h_data, d_data, mem_size, cudaMemcpyDeviceToHost));
    float optimizedTime = cutGetTimerValue(timer);
    cutStopTimer(timer);

    // checksum to compare against the CPU result
    int sumGPU = 0;
    for(int i = 0; i < nData; i++) {
        sumGPU += h_data[i] >> 10;
    }
    printf("time: %0.3f ms\n\n", optimizedTime);
    printf("GPU : %d\n", sumGPU);

    // cleanup memory
    free(h_data);
    cutilSafeCall(cudaFree(d_data));
    cutilCheckError( cutDeleteTimer(timer));

    cudaThreadExit();
}