多体問題（1） - inamori’s diary

ネタがないので、浮動小数点数の問題をみてみる。
n個の質点が互いに重力による相互作用を及ぼしあう問題を考える。なるべく多くの質点を用いたいが、ここでは4096個の質点の計算をする。また、これらはトーラス面上にあるとする。1×1の正方形を用意し、上下左右を接合する。周期境界になっていると言ってもいい。質点は正方形からはみ出すと、座標に1を足すか引くかして正方形に収まるようにする。また、質点同士の距離はなるべく近くなるように計算する。すなわち、(0.1, 0.1)と(0.9, 0.9)の距離は、√(0.2²+0.2²)とする。計算しやすいように、質点の質量は1、重力定数も1とする。

4096個のfloatの配列を用意し、位置・速度・加速度のxy成分を割り当てる。初期値は、位置は擬似乱数で与え、速度・加速度は0とする。これをGPUに送り、質点ごとに他の質点との相互作用を計算する。本当は、4096*4095/2回の計算をすればよいが、ここでは4096*4095回の計算をしている。時間刻み幅を0.0001とし、差分計算をし、1回計算したら、CPUにデータを転送する。2回目以降は、GPUにあるデータをそのまま使う。1000回繰り返し計算を行った。

CPUでは235s、GPUでは20.6sかかった。11倍にしかなっていない。
また、doubleの計算と比べるとだんだん座標がずれてくる。このような積み重ねできいてくる計算は、予想通りGPUには厳しい。



#include 

#include 

#include 
double diff_x(double x1, double x0) {

    double  dx = x1 - x0;

    if(dx < -0.5)

        dx += 1.0;

    else if(0.5 < dx)

        dx -= 1.0;

    return dx;

}
void calcAcceleration(double *h_data, int nParticles) {

    // 加速度初期化

    for(int i = 0; i < nParticles; i++) {

        h_data[i*6+4] = 0.0;

        h_data[i*6+5] = 0.0;

    }

    

    for(int i = 0; i < nParticles; i++) {

        double  x = h_data[i*6];

        double  y = h_data[i*6+1];

        for(int k = i + 1; k < nParticles; k++) {

            double  dx = diff_x(h_data[k*6], x);

            double  dy = diff_x(h_data[k*6+1], y);

            double  dist_square = dx * dx + dy * dy;

            double  d = dist_square * sqrt(dist_square);

            double  ax = dx / d;

            double  ay = dy / d;

            h_data[i*6+4] += ax;

            h_data[i*6+5] += ay;

            h_data[k*6+4] -= ax;

            h_data[k*6+5] -= ay;

        }

    }

}
// [ 0, 1 )に

void normalize(double& x) {

    if(x < 0.0)

        x += 1.0;

    else if(x >= 1.0)

        x -= 1.0;

}
void update(double *h_data, int nParticles, double dt) {

    for(int i = 0; i < nParticles; i++) {

        h_data[i*6+2] += h_data[i*6+4] * dt;

        h_data[i*6+3] += h_data[i*6+5] * dt;

        h_data[i*6  ] += h_data[i*6+2] * dt;

        h_data[i*6+1] += h_data[i*6+3] * dt;

        normalize(h_data[i*6]);

        normalize(h_data[i*6+1]);

    }

}
int main() {

    const unsigned int  size_x = 64;

    const unsigned int  size_y = 64;

    const unsigned int  nParticles = size_x * size_y;

    const unsigned int  nData = nParticles * 6;

    const unsigned int  mem_size = sizeof(double) * nData;

    const double        dt = 0.0001;

    const int           nRep = 1000;

    

    double  x[nRep], y[nRep];   // ID0の粒子の位置

    

    double  *h_data = (double *)malloc(mem_size);

    for(int i = 0; i < nParticles; i++) {

        for(int k = 0; k < 2; k++) {

            h_data[i*6+k] = (double)rand() / RAND_MAX;  // 位置

            h_data[i*6+k+2] = 0.0;                      // 速度

        }

    }

    

    LARGE_INTEGER nFreq, nStart, nEnd;

    QueryPerformanceFrequency(&nFreq);

    QueryPerformanceCounter(&nStart);

    

    double  t = 0.0;

    for(int i = 0; i < nRep; i++) {

        calcAcceleration(h_data, nParticles);

        update(h_data, nParticles, dt);

        x[i] = h_data[0];

        y[i] = h_data[1];

        t += dt;

    }

    

    QueryPerformanceCounter(&nEnd);

    

    for(int i = 0; i < nRep; i++) {

        printf("%5.2f %.9f %.9f\n", dt * i, x[i], y[i]);

    }

    

    printf("time: %0.3f ms\n\n",

            (nEnd.QuadPart - nStart.QuadPart) * 1e3 / nFreq.QuadPart);

    

    free(h_data);

}



#include 

#include 

#include 

#include 
#define BLOCK_DIM 16
__device__ double diff_x(float x1, float x0) {

    float   dx = x1 - x0;

    if(dx < -0.5)

        dx += 1.0;

    else if(0.5 < dx)

        dx -= 1.0;

    return dx;

}
__device__ void calcAcceleration(float *data, int nParticles, int k) {

    float   x = data[k*6];

    float   y = data[k*6+1];

    float   ax = 0.0;

    float   ay = 0.0;

    

    for(int i = 0; i < nParticles; i++) {

        if(i == k)

            continue;

        

        float   dx = diff_x(data[i*6], x);

        float   dy = diff_x(data[i*6+1], y);

        float   dist_square = dx * dx + dy * dy;

        float   d = dist_square * sqrt(dist_square);

        ax += dx / d;

        ay += dy / d;

    }

    

    data[k*6+4] = ax;

    data[k*6+5] = ay;

}
// [ 0, 1 )に

__device__ void normalize(float& x) {

    if(x < 0.0)

        x += 1.0;

    else if(x >= 1.0)

        x -= 1.0;

}
__global__ void calc(float *data, int width, int height,

                                        int nParticles, float dt) {

    unsigned int xIndex = blockIdx.x * BLOCK_DIM + threadIdx.x;

    unsigned int yIndex = blockIdx.y * BLOCK_DIM + threadIdx.y;

    if( (xIndex < width) && (yIndex < height)) {

        unsigned int index = yIndex * width + xIndex;

        calcAcceleration(data, nParticles, index);

    }

}
__global__ void update(float *data, int width, int height, float dt) {

    unsigned int xIndex = blockIdx.x * BLOCK_DIM + threadIdx.x;

    unsigned int yIndex = blockIdx.y * BLOCK_DIM + threadIdx.y;

    if( (xIndex < width) && (yIndex < height)) {

        unsigned int index = yIndex * width + xIndex;

        int     k = index * 6;

        data[k+2] += data[k+4] * dt;

        data[k+3] += data[k+5] * dt;

        data[k  ] += data[k+2] * dt;

        data[k+1] += data[k+3] * dt;

        normalize(data[k]);

        normalize(data[k+1]);

    }

}
void runTest( int argc, char** argv);
int main( int argc, char** argv) {

    runTest( argc, argv);

    cutilExit(argc, argv);

}
void runTest( int argc, char** argv) {

    const unsigned int  size_x = 64;

    const unsigned int  size_y = 64;

    const unsigned int  nParticles = size_x * size_y;

    const unsigned int  nData = nParticles * 6;

    const unsigned int  mem_size = sizeof(float) * nData;

    const float         dt = 0.0001;

    const int           nRep = 1000;

    

    double  x[nRep], y[nRep];   // ID0の粒子の位置

    

    unsigned int timer;

    cutCreateTimer(&timer);

    

    if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )

        cutilDeviceInit(argc, argv);

    else

        cudaSetDevice( cutGetMaxGflopsDeviceId() );

    

    // 初期化

    float   *h_data = (float *)malloc(mem_size);

    for(int i = 0; i < nParticles; i++) {

        for(int k = 0; k < 2; k++) {

            h_data[i*6+k] = (double)rand() / RAND_MAX;  // 位置

            h_data[i*6+k+2] = 0.0;                      // 速度

        }

    }

    

    cutStartTimer(timer);

    

    // データをGPUに転送

    float   *d_data;

    cutilSafeCall(cudaMalloc( (void**)&d_data, mem_size));

    cutilSafeCall(cudaMemcpy(d_data, h_data, mem_size, cudaMemcpyHostToDevice));

    

    // setup execution parameters

    dim3 grid(size_x / BLOCK_DIM, size_y / BLOCK_DIM, 1);

    dim3 threads(BLOCK_DIM, BLOCK_DIM, 1);

    

    for(int i = 0; i < nRep; i++) {

        // GPUで計算実行

        calc<<< grid, threads >>>(d_data, size_x, size_y, nParticles, dt);

        cudaThreadSynchronize();

        

        cutilCheckMsg("Kernel execution failed");

        

        update<<< grid, threads >>>(d_data, size_x, size_y, dt);

        cudaThreadSynchronize();

        

        cutilCheckMsg("Kernel execution failed");

        

        // データをGPUから転送

        cutilSafeCall(cudaMemcpy(h_data, d_data, mem_size, cudaMemcpyDeviceToHost));

        x[i] = h_data[0];

        y[i] = h_data[1];

    }

    float   optimizedTime = cutGetTimerValue(timer);

    cutStopTimer(timer);

    

    for(int i = 0; i < nRep; i++) {

        printf("%5.2f %.9f %.9f\n", dt * i, x[i], y[i]);

    }

    

    printf("time: %0.3f ms\n\n", optimizedTime);

    

    // cleanup memory

    free(h_data);

    cutilSafeCall(cudaFree(d_data));

    cutilCheckError( cutDeleteTimer(timer));

    

    cudaThreadExit();

}