関数の速度（9） - inamori’s diary

前回の場合分けの部分も並列処理をする。
SSEに分岐などないので、そこは工夫が必要である。

方針としては、
xy平面を象限に分けて考える。
第4象限は3π/4回転して、x >= y && x + y >= 0の領域に持ってくる。
同様に他の象限も回転する。
そこで素のatan2の計算を行い、さっきの回転した分だけ減じる。

回転する角度はどう決めるかというと、
角度はxとyの符号で決まるから、その符号を取り出す。



#define ALIGNED __declspec(align(16))

    ...

    ALIGNED int     mask[4] = {

            0x80000000, 0x80000000, 0x80000000, 0x80000000

    };

    

    __asm {

        movaps  xmm2,   [x1]

        movaps  xmm3,   [y1]

        movaps  xmm4,   xmm2    ; x

        movaps  xmm5,   xmm3    ; y

        movaps  xmm6,   [mask]

        andps   xmm4,   xmm6

        andps   xmm5,   xmm6

        ...

    }

xとyをそれぞれ0x80000000とandを取る。
そうすると、floatは最上位ビットが立っていれば負、さもなくば正だから、
負なら0x80000000、正なら0となる。
これと、0x3f000000とorを取ると、
負なら0xbf000000、正なら0x3f000000となり、
これはfloatで、それぞれ-0.5と0.5を表す。



    ALIGNED int     ex[4] = {

            0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000

    };

        ...

        movaps  xmm6,   [ex]

        orps    xmm4,   xmm6    ; b1

        orps    xmm5,   xmm6    ; b2

xのでこの変換を施したものをb₁、同様のyをb₂とする。
これで各象限を表現できる。
b₁とb₂が共に-0.5なら第3象限となる。
以下では、第3,4,1,2象限を順番に並べたベクトルを用いる。
すなわち、

b₁ = ( -0.5, 0.5, 0.5, -0.5 )
b₂ = ( -0.5, -0.5, 0.5, 0.5 )

ここで次の演算を行う。

c = b₁b₂ - b₂

すると、

c = ( 0.75, 0.25, -0.25, -0.75 )

となり、これにπをかければ回転角度となる。
また、回転の行列計算なcos/sinは、

cos = √2b₁
sin = -√2b₂

となる。

しかし、この計算を行ったところ44ns程度と、SSEを用いなかったときより遅くなった。
ただ、前回の場合わけを行った場合よりは約3倍速くなった。前回、実家では速かったので、コンパイラが何かしているのかCPUの違いがあるのか、何か起こっているにちがいない。



void atan13(float *z, const float *y, const float *x) {

    ALIGNED float   x1[4], y1[4], z1[4];

    memcpy(x1, x, 16);

    memcpy(y1, y, 16);

    ALIGNED int     mask[4] = { 0x80000000, 0x80000000, 0x80000000, 0x80000000};

    ALIGNED int     ex[4] = { 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 };

    ALIGNED float   pi[4] = { M_PI, M_PI, M_PI, M_PI };

    ALIGNED float   sqrt2[4] = { sqrt(2.0f), sqrt(2.0f), sqrt(2.0f), sqrt(2.0f) };

    

    ALIGNED float   a0[4] = { 256.0f, 256.0f, 256.0f, 256.0f };

    ALIGNED float   a1[4] = { 5943.0f, 5943.0f, 5943.0f, 5943.0f };

    ALIGNED float   a2[4] = { 19250.0f, 19250.0f, 19250.0f, 19250.0f };

    ALIGNED float   a3[4] = { 15015.0f, 15015.0f, 15015.0f, 15015.0f };

    ALIGNED float   b0[4] = { 1225.0f, 1225.0f, 1225.0f, 1225.0f };

    ALIGNED float   b1[4] = { 11025.0f, 11025.0f, 11025.0f, 11025.0f };

    ALIGNED float   b2[4] = { 24255.0f, 24255.0f, 24255.0f, 24255.0f };

    

    ALIGNED float   a[4];

    

    __asm {

        movaps  xmm2,   [x1]

        movaps  xmm0,   [pi]

        movaps  xmm3,   [y1]

        movaps  xmm4,   xmm2    ; x

        movaps  xmm5,   xmm3    ; y

        movaps  xmm6,   [mask]

        andps   xmm4,   xmm6

        andps   xmm5,   xmm6

        movaps  xmm6,   [ex]

        orps    xmm4,   xmm6    ; b1

        orps    xmm5,   xmm6    ; b2

        movaps  xmm7,   xmm4

        mulps   xmm7,   xmm5

        subps   xmm7,   xmm5    ; c

        mulps   xmm7,   xmm0    ; rotate_angle

        movaps  xmm0,   [sqrt2]

        mulps   xmm4,   xmm0    ; cos

        mulps   xmm5,   xmm0    ; -sin

        

        // rotate

        movaps  xmm1,   xmm2

        mulps   xmm1,   xmm4

        movaps  xmm0,   xmm3

        mulps   xmm0,   xmm5

        addps   xmm1,   xmm0    ; x = x*cos-y*sin

        movaps  xmm0,   xmm3

        mulps   xmm0,   xmm4

        mulps   xmm2,   xmm5

        subps   xmm0,   xmm2    ; y = y*cos+x*sin

        

        movaps  xmm2,   xmm0

        movaps  xmm3,   xmm1

        mulps   xmm2,   xmm2    ; y^2

        mulps   xmm3,   xmm3    ; x^2

        movaps  xmm6,   [a3]

        movaps  xmm4,   xmm3

        mulps   xmm4,   xmm6    ; xmm4 = x^2 * 15015

        movaps  xmm5,   xmm4    ; xmm5 = xmm4

        movaps  xmm6,   [a2]

        mulps   xmm6,   xmm2

        addps   xmm4,   xmm6    ; xmm4 += a1 * y^2

        mulps   xmm4,   xmm3    ; xmm4 *= x^2

        movaps  xmm6,   [b2]

        mulps   xmm6,   xmm2

        addps   xmm5,   xmm6    ; xmm5 += a1 * y^2

        mulps   xmm5,   xmm3    ; xmm5 *= x^2

        movaps  xmm6,   [a1]

        mulps   xmm6,   xmm2

        mulps   xmm6,   xmm2

        addps   xmm4,   xmm6

        mulps   xmm4,   xmm3

        movaps  xmm6,   [b1]

        mulps   xmm6,   xmm2

        mulps   xmm6,   xmm2

        addps   xmm5,   xmm6

        mulps   xmm5,   xmm3

        movaps  xmm6,   [a0]

        mulps   xmm6,   xmm2

        mulps   xmm6,   xmm2

        mulps   xmm6,   xmm2

        addps   xmm4,   xmm6

        mulps   xmm4,   xmm3

        movaps  xmm6,   [b0]

        mulps   xmm6,   xmm2

        mulps   xmm6,   xmm2

        mulps   xmm6,   xmm2

        addps   xmm5,   xmm6

        mulps   xmm5,   xmm3

        mulps   xmm4,   xmm0    ; xmm4 *= y

        mulps   xmm5,   xmm1    ; xmm5 *= x

        rcpps   xmm6,   xmm5    ; xmm6 = 1 / xmm5

        movaps  xmm1,   xmm6

        mulps   xmm1,   xmm1

        mulps   xmm1,   xmm5    ; xmm1 = xmm6^2 * xmm5

        addps   xmm6,   xmm6    ; xmm6 += xmm6

        subps   xmm6,   xmm1    ; xmm6 -= xmm1

        mulps   xmm4,   xmm6

        movaps  a,      xmm4

        subps   xmm4,   xmm7    ; xmm4 += rotate_angle

        movaps  z1,     xmm4    ; z = xmm4

    }

    

    for(int i = 0; i < 4; i++) {

        z[i] = z1[i];

    }

}