GPGPUで多倍長整数計算(3)

10進変換(3)

前回、16ビットずつ掛け算していったが、2100000 = (250000)2だから、250000を計算してそれを自乗したほうが速いような気がする。さらに、250000も同様に。


実際に組んでみたら、倍くらいの時間がかかるようになった。
ただし、この掛け算はGPUが得意な計算なので、GPUにやらせれば速くなる。今回のコードは、GPUを使いやすいように組んであるので遅い(というか、GPU版をCPU版に変更して作った)。



#include
#include
#include

typedef unsigned int uint;

const int max_bit = 32; // 配列の要素当たりのビット数
const int max_dec_exp = 4;
const int max_dec = 10000;
const int multi_exp = 16;

int trim(uint a, int k);
void multiple(uint *ary, int& n);
void add(int a, uint *ary, int& n);
void calcWH(int& width, int& height, int n);
int calcBit(int n);
void calc(float *data, int bit, int num_elem);

int main(int argc, char **argv) {
int nexp;
try {
if(argc != 2)
throw(1);
nexp = atoi(argv[1]);
if(nexp <= 0)
throw(1);
}
catch(...) {
fprintf(stderr, "usage : bin n.\n");
fprintf(stderr, " n : natural number\n");
}

// ここから計算
uint *ary_bin;
uint *ary_dec;

// 何度開平できるか
int nRep = 0;
{
const int limit = 32;
int n = nexp;
while((n & 1) == 0 && n > limit) {
n >>= 1;
nRep++;
}
}

// 開平後の2進データを作る
uint nbin = (nexp >> nRep) / max_bit + 1;
int ibin = (nexp >> nRep) % max_bit;
ary_bin = new uint[nbin];
for(int l = 0; l < nbin - 1; l++)
ary_bin[l] = 0;
ary_bin[nbin-1] = 1 << ibin;

// 開平後の10進データを作る
uint ndec = (int)(nexp / (log((double)max_dec) - log(2.0))) + 1;
ary_dec = new uint[ndec];
int max_dec_elem = 0;

int i = nbin - 1;
int k = ibin / multi_exp;
ary_dec[0] = trim(ary_bin[i], k);
if(ary_dec[0] >= max_dec) {
ary_dec[1] = ary_dec[0] / max_dec;
ary_dec[0] -= ary_dec[1] * max_dec;
max_dec_elem = 1;
}

k--;
for( ; i >= 0; i--) {
for( ; k >= 0; k--) {
multiple(ary_dec, max_dec_elem);
int n = trim(ary_bin[i], k);
add(n, ary_dec, max_dec_elem);
}
k = max_bit / multi_exp - 1;
}

// 必要な回数平方する
int nWidth, nHeight;
calcWH(nWidth, nHeight, ndec);
int nData = nWidth * nHeight * 4;

// 配列をintからfloatに
float *data = new float[ndec*4*2];
for(int i = 0; i <= max_dec_elem; i++)
data[i*4] = (float)ary_dec[i];

for(int iRep = 0; iRep < nRep; iRep++) {
int bit = calcBit(nWidth);
int num_elem = max_dec_elem * 2 + 1;
calcWH(nWidth, nHeight, num_elem);
calc(data, bit, num_elem);

// 繰上り計算
max_dec_elem *= 2;
int carry[2];
carry[0] = carry[1] = 0;
for(int i = 0; i <= max_dec_elem; i++) {
const int div22 = (1 << 22) / max_dec;
const int mod22 = (1 << 22) % max_dec;
int res = (int)data[i*4+1];
int div = res / max_dec;
int mod = res - div * max_dec;
res = mod * mod22 + carry[0];
carry[0] = div * mod22 + mod * div22 + carry[1];
carry[1] = div * div22;

res += (int)data[i*4];
div = res / max_dec;
mod = res - div * max_dec;
data[i*4] = (float)mod;
carry[0] += div;
}
if(carry[0] != 0) {
max_dec_elem++;
data[max_dec_elem*4] = (float)carry[0];
}
}

fprintf(stdout, "%4d", (int)data[max_dec_elem*4]);
for(int i = max_dec_elem - 1; i >= 0; i--)
fprintf(stdout, " %04d", (int)data[i*4]);
fprintf(stdout, "\n");
delete [] ary_bin;
delete [] ary_dec;
delete [] data;

return 0;
}

int trim(uint a, int k) {
int l = multi_exp * k;
return (a >> l) & ((1 << multi_exp) - 1);
}

void multiple(uint *ary, int& n) {
int carry = 0;
for(int i = 0; i <= n; i++) {
ary[i] <<= multi_exp;
ary[i] += carry;
carry = ary[i] / max_dec;
ary[i] -= carry * max_dec;
}

if(carry != 0) {
if(carry >= max_dec) {
n += 2;
ary[n] = carry / max_dec;
ary[n-1] = carry - ary[n] * max_dec;
}
else {
n++;
ary[n] = carry;
}
}
}

void add(int a, uint *ary, int& n) {
int carry = a;
for(int i = 0; i <= n; i++) {
ary[i] += carry;
carry = ary[i] / max_dec;
if(carry != 0)
ary[i] -= carry * max_dec;
else
break;
}

if(carry) {
n++;
ary[n] = 1;
}
}

void calcWH(int& width, int& height, int n) {
if(n == 0) {
width = height = 0;
return;
}

width = height = 1;
n--;
while(n) {
width <<= 1;
n >>= 1;
if(n == 0)
break;
height <<= 1;
n >>= 1;
}
}

int calcBit(int n) {
int bit = 0;
while(n > 1) {
bit++;
n >>= 1;
}
return bit;
}

void calc(float *data, int bit, int num_elem) {
const int max_bit = 30;
const int max_bin = 1 << max_bit;
const int max_color_bit = 22;

float *result = new float[num_elem*4];
for(int n = 0; n < num_elem; n++) {
int c[2];
c[0] = c[1] = 0;

int iBegin, iEnd;
iBegin = max(0, n - num_elem / 2);
iEnd = (n + 1) / 2;

int mask = (1 << bit) - 1;
if((n & 1) == 0) {
int m = n >> 1;
int a = (int)data[m*4];
c[0] += a * a;
}

for(int i = iBegin; i < iEnd; i++) {
int k = n - i;
int a = (int)data[i*4];
int b = (int)data[k*4];

c[0] += a * b * 2;
if(c[0] >= max_bin) {
c[0] -= max_bin;
c[1]++;
}
}

// 22bit区切りにして出力
c[1] = (c[1] << (max_bit - max_color_bit)) + (c[0] >> max_color_bit);
c[0] = c[0] & ((1 << max_color_bit) - 1);
result[n*4] = (float)c[0];
result[n*4+1] = (float)c[1];
}

for(int n = 0; n < num_elem; n++) {
data[n*4] = result[n*4];
data[n*4+1] = result[n*4+1];
}

delete [] result;
}