GPGPUで多倍長整数計算(4)

10進変換(4)

前回、ある程度小さい数を10進数にして、それを自乗していくという方法を考えた。今回は、その自乗をGPUで行う。


多倍長整数の乗算は、多項式の乗算と同じようなものだから、1ピクセル(一つの項)ごとに大量の乗算・加算をする。だからGPUでやらせると猛烈に速くなる。

グラフの青いほうは従来の(前々回の)方法、赤いほうがGPUを使った方法である。
2nをできるだけ開平する。ただし1万以下なら開平しない。そこから自乗をGPUで行う。数万乗なら効果はないが、160万乗だと実に50倍速くなった。


もっとも、この方法はたいていの場合使えない。



#extension GL_ARB_texture_rectangle : enable
#extension GL_EXT_gpu_shader4 : enable

uniform sampler2DRect texUnit0;
uniform sampler2DRect texUnit1;

uniform int bit;
uniform int num_elem;

const int max_bit = 30;
const int max_bin = 1 << max_bit;
const int max_color_bit = 22;

void main() {
int n = int(gl_FragCoord.x) + (int(gl_FragCoord.y) << bit);
int c[2];
c[0] = c[1] = 0;

if(n >= num_elem)
return;

int iBegin, iEnd;
iBegin = max(0, n - num_elem / 2);
iEnd = (n + 1) / 2;

int mask = (1 << bit) - 1;
if((n & 1) == 0) {
int m = n >> 1;
vec2 coord_a = vec2(m & mask, m >> bit);
int a = int(texRECT(texUnit1, coord_a).x);
c[0] += a * a;
}

for(int i = iBegin; i < iEnd; i++) {
int k = n - i;
vec2 coord_a = vec2(i & mask, i >> bit);
vec2 coord_b = vec2(k & mask, k >> bit);
int a = int(texRECT(texUnit1, coord_a).x);
int b = int(texRECT(texUnit1, coord_b).x);

c[0] += a * b * 2;
if(c[0] >= max_bin) {
c[0] -= max_bin;
c[1]++;
}
}

// 22bit区切りにして出力
c[1] = (c[1] << (max_bit - max_color_bit)) + (c[0] >> max_color_bit);
c[0] = c[0] & ((1 << max_color_bit) - 1);
gl_FragColor = vec4(c[0], c[1], iBegin, iEnd);
}


#include
#include
#define _USE_MATH_DEFINES
#include
#include
#include
#include

using namespace std;

#pragma comment (lib, "glew32.lib")

// GLSLシェーダの読み込み(CreateShaderから呼び出す)
int LoadGlslShader(GLhandleARB *handle, const char *filename)
{
int begin, end;
char *str;
FILE *F;
F = fopen(filename, "r");
if(F == NULL)
{
return -1;
}
begin = ftell(F);
fseek(F, 0, SEEK_END);
end = ftell(F);
fseek(F, 0, SEEK_SET);
str = (char*)calloc( (end-begin), sizeof(char));
fread((void*)str, end-begin, 1, F);
fclose(F);
glShaderSourceARB(*handle, 1, (const char**)(&str), NULL);
free(str);
return 0;
}

// GLSLシェーダの生成
int CreateShader(GLhandleARB& g_programObject,
GLint& texUnit0, const char *filename) {
int ret;
GLenum errCode;

g_programObject = glCreateProgramObjectARB();

GLhandleARB g_fragmentShader;
g_fragmentShader = glCreateShaderObjectARB(GL_FRAGMENT_SHADER_ARB);
ret = LoadGlslShader(&g_fragmentShader, filename);
if(ret)
return -1;

glCompileShaderARB(g_fragmentShader);
glGetObjectParameterivARB(g_fragmentShader, GL_OBJECT_COMPILE_STATUS_ARB, &ret);
if( (errCode=glGetError()) != GL_NO_ERROR || ret == GL_FALSE) {
char str[1024];
int length = 0;
glGetInfoLogARB(g_fragmentShader, 0xff, &length, str);
fprintf(stderr, "Compile Failed: %s\n", filename);
fprintf(stderr, "%s\n", str);
return -1;
}
glAttachObjectARB(g_programObject, g_fragmentShader);
if( (errCode=glGetError()) != GL_NO_ERROR) {
return -1;
}

glLinkProgramARB(g_programObject);
glGetObjectParameterivARB(g_programObject, GL_OBJECT_LINK_STATUS_ARB, &ret);
if( (errCode=glGetError()) != GL_NO_ERROR || ret == GL_FALSE) {
return -1;
}

texUnit0 = glGetUniformLocationARB(g_programObject, "texUnit0");
return 0;
}

// テクスチャ生成
int CreateTexture(int texid, int width, int height, unsigned short opt1, unsigned short opt2)
{
glBindTexture(opt1, texid);
glTexParameteri(opt1, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
glTexParameteri(opt1, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
glTexParameteri(opt1, GL_TEXTURE_WRAP_S, GL_CLAMP);
glTexParameteri(opt1, GL_TEXTURE_WRAP_T, GL_CLAMP);
glTexImage2D(opt1, 0, opt2, width, height, 0, GL_RGBA, GL_FLOAT, 0);
if (glGetError() != GL_NO_ERROR) {
return -1;
}
return 0;
}

void setParameter1i(GLhandleARB g_programObject, const char *name, int n) {
int h = glGetUniformLocationARB(g_programObject, name);
glUniform1iARB(h, n);
}

typedef unsigned int uint;

const int max_bit = 32; // 配列の要素当たりのビット数
const int max_dec_exp = 4;
const int max_dec = 10000;
const int multi_exp = 16;

int trim(uint a, int k);
void multiple(uint *ary, int& n);
void add(int a, uint *ary, int& n);
void calcWH(int& width, int& height, int n);
int calcBit(int n);

void twice(uint *ary, int& n) {
int carry = 0;
for(int i = 0; i <= n; i++) {
ary[i] <<= 1;
ary[i] += carry;
if(ary[i] >= max_dec) {
ary[i] -= max_dec;
carry = 1;
}
else {
carry = 0;
}
}

if(carry == 1) {
n++;
ary[n] = 1;
}
}

void increment(uint *ary, int& n) {
bool carry = true;
for(int i = 0; i <= n; i++) {
ary[i]++;
if(ary[i] >= max_dec)
ary[i] = 0;
else
break;
}

if(carry) {
n++;
ary[n] = 1;
}
}

int main(int argc, char **argv) {
int nexp;
try {
if(argc != 2)
throw(1);
nexp = atoi(argv[1]);
if(nexp <= 0)
throw(1);
}
catch(...) {
fprintf(stderr, "usage : bin n.\n");
fprintf(stderr, " n : natural number\n");
}

glutInit( &argc, argv );
glutInitWindowSize(1024, 1024);
unsigned int g_glutWindowHandle;
g_glutWindowHandle = glutCreateWindow("GpgpuHelloWorld");
glewInit();

unsigned int g_fb = -1;
glGenFramebuffersEXT(1, &g_fb);
glBindFramebufferEXT(GL_FRAMEBUFFER_EXT, g_fb);

GLhandleARB g_programObject;
GLint texUnit0, texUnit1;
unsigned int g_nTexID[2];
CreateShader(g_programObject, texUnit0, "bin5.frag");
texUnit1 = glGetUniformLocationARB(g_programObject, "texUnit1");
glGenTextures(2, g_nTexID);

unsigned short TEX_OPT1 = GL_TEXTURE_RECTANGLE_NV;
unsigned short TEX_OPT2 = GL_FLOAT_RGBA32_NV;

// ここから計算
uint *ary_bin;
uint *ary_dec;

// 何度開平できるか
int nRep = 0;
{
const int limit = 10000;
int n = nexp;
while((n & 1) == 0 && n > limit) {
n >>= 1;
nRep++;
}
}

// 開平後の2進データを作る
uint nbin = (nexp >> nRep) / max_bit + 1;
int ibin = (nexp >> nRep) % max_bit;
ary_bin = new uint[nbin];
for(int l = 0; l < nbin - 1; l++)
ary_bin[l] = 0;
ary_bin[nbin-1] = 1 << ibin;

// 開平後の10進データを作る
uint ndec = (int)(nexp / (log((double)max_dec) - log(2.0))) + 1;
ary_dec = new uint[ndec];
int max_dec_elem = 0;

int i = nbin - 1;
int k = ibin / multi_exp;
ary_dec[0] = trim(ary_bin[i], k);
if(ary_dec[0] >= max_dec) {
ary_dec[1] = ary_dec[0] / max_dec;
ary_dec[0] -= ary_dec[1] * max_dec;
max_dec_elem = 1;
}

k--;
for( ; i >= 0; i--) {
for( ; k >= 0; k--) {
multiple(ary_dec, max_dec_elem);
int n = trim(ary_bin[i], k);
add(n, ary_dec, max_dec_elem);
}
k = max_bit / multi_exp - 1;
}

// GPUで必要な回数平方する
int nWidth, nHeight;
calcWH(nWidth, nHeight, ndec);
int nData = nWidth * nHeight * 4;

CreateTexture(g_nTexID[0], nWidth, nHeight, TEX_OPT1, TEX_OPT2);
CreateTexture(g_nTexID[1], nWidth, nHeight, TEX_OPT1, TEX_OPT2);

glMatrixMode(GL_PROJECTION);
glLoadIdentity();
gluOrtho2D(0.0, nWidth, 0.0, nHeight);
glMatrixMode(GL_MODELVIEW);
glLoadIdentity();
glViewport(0, 0, nWidth, nHeight);

glFlush();

glTexEnvi(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE);
glFramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT,
GL_COLOR_ATTACHMENT0_EXT, TEX_OPT1, g_nTexID[0], 0);
glUseProgramObjectARB(g_programObject);

// 配列をintからfloatに
float *data = new float[ndec*4*2];
for(int i = 0; i <= max_dec_elem; i++)
data[i*4] = (float)ary_dec[i];

for(int iRep = 0; iRep < nRep; iRep++) {
int num_elem = max_dec_elem * 2 + 1;
calcWH(nWidth, nHeight, num_elem);
setParameter1i(g_programObject, "bit", calcBit(nWidth));
setParameter1i(g_programObject, "num_elem", num_elem);

glBindTexture(TEX_OPT1, g_nTexID[1]);
glTexSubImage2D(TEX_OPT1, 0, 0, 0,
nWidth, nHeight, GL_RGBA, GL_FLOAT, data);

glActiveTexture(GL_TEXTURE0);
glBindTexture(TEX_OPT1, g_nTexID[0]);
glUniform1iARB(texUnit0, 0);
glActiveTexture(GL_TEXTURE1);
glBindTexture(TEX_OPT1, g_nTexID[1]);
glUniform1iARB(texUnit1, 1);

glDrawBuffer (GL_COLOR_ATTACHMENT0_EXT);

glBegin(GL_QUADS);
glTexCoord2f(0.0, 0.0);
glVertex2f (0.0, 0.0);
glTexCoord2f(nWidth, 0.0);
glVertex2f (nWidth, 0.0);
glTexCoord2f(nWidth, nHeight);
glVertex2f (nWidth, nHeight);
glTexCoord2f(0.0, nHeight);
glVertex2f (0.0, nHeight);
glEnd();

glFlush();
glReadBuffer(GL_COLOR_ATTACHMENT0_EXT);

glReadPixels(0,0, nWidth, nHeight, GL_RGBA, GL_FLOAT, data);

// 繰上り計算
max_dec_elem *= 2;
int carry[2];
carry[0] = carry[1] = 0;
for(int i = 0; i <= max_dec_elem; i++) {
const int div22 = (1 << 22) / max_dec;
const int mod22 = (1 << 22) % max_dec;
int res = (int)data[i*4+1];
int div = res / max_dec;
int mod = res - div * max_dec;
res = mod * mod22 + carry[0];
carry[0] = div * mod22 + mod * div22 + carry[1];
carry[1] = div * div22;

res += (int)data[i*4];
div = res / max_dec;
mod = res - div * max_dec;
data[i*4] = (float)mod;
carry[0] += div;
}
if(carry[0] != 0) {
max_dec_elem++;
data[max_dec_elem*4] = (float)carry[0];
}
}
glDeleteTextures(2, g_nTexID);

fprintf(stdout, "%4d", (int)data[max_dec_elem*4]);
for(int i = max_dec_elem - 1; i >= 0; i--)
fprintf(stdout, " %04d", (int)data[i*4]);
fprintf(stdout, "\n");
delete [] ary_bin;
delete [] ary_dec;
delete [] data;
glutDestroyWindow(g_glutWindowHandle);

return 0;
}

int trim(uint a, int k) {
int l = multi_exp * k;
return (a >> l) & ((1 << multi_exp) - 1);
}

void multiple(uint *ary, int& n) {
int carry = 0;
for(int i = 0; i <= n; i++) {
ary[i] <<= multi_exp;
ary[i] += carry;
carry = ary[i] / max_dec;
ary[i] -= carry * max_dec;
}

if(carry != 0) {
if(carry >= max_dec) {
n += 2;
ary[n] = carry / max_dec;
ary[n-1] = carry - ary[n] * max_dec;
}
else {
n++;
ary[n] = carry;
}
}
}

void add(int a, uint *ary, int& n) {
int carry = a;
for(int i = 0; i <= n; i++) {
ary[i] += carry;
carry = ary[i] / max_dec;
if(carry != 0)
ary[i] -= carry * max_dec;
else
break;
}

if(carry) {
n++;
ary[n] = 1;
}
}

void calcWH(int& width, int& height, int n) {
if(n == 0) {
width = height = 0;
return;
}

width = height = 1;
n--;
while(n) {
width <<= 1;
n >>= 1;
if(n == 0)
break;
height <<= 1;
n >>= 1;
}
}

int calcBit(int n) {
int bit = 0;
while(n > 1) {
bit++;
n >>= 1;
}
return bit;
}