MojoでProject Euler 31(4)

https://projecteuler.net/problem=31

前回、同じようなコード、

            for i in range(e, self.L-width, width):
                var e1 = self.a.load[width=nelts](i)
                var e2 = self.a.load[width=nelts](i-e)
                self.a.store[width=nelts](i, (e1+e2)%D)

が3つ出てきましたが、次のようにすれば同じコードにできます。ただし、これでもneltsが16だとうまくいきません。一般に書くのは難しそうです。

from math import max
import sys


#################### constatns ####################

# number of elements
alias nelts = simdwidthof[DType.int32]()
alias D: SIMD[DType.int32, 1] = 10**9+7


#################### Polynomial ####################

struct Polynomial:
    var a: DTypePointer[DType.int32]
    var L: Int
    
    fn __init__(inout self, a: DTypePointer[DType.int32], L: Int):
        self.a = a
        self.L = L
    
    fn __del__(owned self):
        print("__del__")
        pass
    
    fn __str__(self) -> String:
        var s = str(self.a[0])
        for i in range(1, self.L):
            s += " " + str(self.a[i])
        return s
    
    fn value(self, i: Int) -> SIMD[DType.int32, 1]:
        return self.a.load(i)
    
    fn divide_each[width: Int](inout self, e: Int):
        for i in range(e, self.L-width, width):
            var e1 = self.a.load[width=width](i)
            var e2 = self.a.load[width=width](i-e)
            self.a.store[width=width](i, (e1+e2)%D)
    
    fn divide(inout self, e: Int):
        fn get_width(w: Int) -> Int:
            var width = nelts
            while w < width:
                width >>= 1
            return width
        
        var width = get_width(e)
        if e >= nelts:
            self.divide_each[nelts](e)
        elif e >= 4:
            self.divide_each[4](e)
        elif e >= 2:
            self.divide_each[2](e)
        
        var first = max(e, e+(self.L-e-width-1)//width*width+width)
        for j in range(first, self.L):
            var e1 = self.a.load(j)
            var e2 = self.a.load(j-e)
            self.a.store(j, (e1+e2)%D)


#################### process ####################

fn collect_coins(N: Int) -> List[Int]:
    var ns = List[Int](1, 2, 5);
    var ps = List[Int]()
    for e in range(19):
        for n in ns:
            var p = n[] * 10**e
            if p > N:
                break
            ps.append(p)
    return ps

# 1/(1-x)
fn initialize(N: Int) -> Polynomial:
    var L = N // nelts + 1
    var ones = SIMD[DType.int32, nelts](1)
    var a = SIMD[DType.int32, nelts](1)
    var p = DTypePointer[DType.int32].alloc(L*nelts)
    for i in range(0, L*nelts, nelts):
        p.store[width=nelts](i, a)
    return Polynomial(p, L*nelts)

fn f(N: Int) -> SIMD[DType.int32, 1]:
    var p = initialize(N)
    var ps = collect_coins(N)
    for i in range(1, ps.size):
        p.divide(ps[i])
    
    return p.value(N)

fn main() raises:
    var args = sys.argv()
    var N = atol(args[1])
    print(f(N))