# Taichi version
import taichi as ti
import time

ti.init()

nx, ny = 10000, 10000
p = ti.field(dtype=ti.f32, shape=(nx, ny))


@ti.kernel
def main():
    """Add 0.1 to every element of the field `p` in place."""
    # Struct-for over a Taichi field: Taichi parallelizes this loop natively.
    for i, j in p:
        p[i, j] = p[i, j] + 0.1


start = time.time()
main()
# NOTE: this is the FIRST call of the kernel, so the measured time also
# includes Taichi's JIT compilation, not just the arithmetic.
print(time.time() - start)
得到的结果是约1.3 sec。
然后写了一个numpy版本的同样的运算,如下:
# numpy version
import numpy as np
import time

nx, ny = 10000, 10000
pnp = np.zeros((nx, ny))


def main(a=None):
    """Add 0.1 to every element of *a* in place using explicit Python loops.

    Args:
        a: 2-D numpy array to update; defaults to the module-level ``pnp``.
           (Optional parameter generalizes the original global-only version;
           calling ``main()`` behaves exactly as before.)
    """
    if a is None:
        a = pnp
    # Per-element Python-level loop: every iteration pays interpreter
    # overhead, which is why this is orders of magnitude slower than a
    # vectorized numpy expression or a JIT-compiled kernel.
    rows, cols = a.shape
    for i in range(rows):
        for j in range(cols):
            a[i, j] = a[i, j] + 0.1


if __name__ == "__main__":
    start = time.time()
    main()
    print(time.time() - start)
结果发现大约是34秒,和Taichi比相差了近30倍。
这里想问两个问题:
1. 我的CPU只是一个i7-6600U,为何同样的运算会相差30多倍?2. 我发现,当矩阵越小的时候结果就会越接近,哪怕是1000 x 1000的矩阵,两者就会降到几乎相同的数量级上。这是合理的现象吗?如果是的话,为什么?
# Taichi version (timed after a warm-up call, so JIT compilation is excluded)
import taichi as ti
import time

ti.init()

nx, ny = 10000, 10000
# f64 field to match numpy's default float64 for a fair comparison.
p = ti.field(dtype=ti.f64, shape=(nx, ny))


@ti.kernel
def main():
    """Add 0.1 to every element of the field `p` in place."""
    for i, j in p:
        p[i, j] = p[i, j] + 0.1


main()  # warm-up call: triggers JIT compilation so it is excluded below
start = time.time()
main()
print(time.time() - start)
# numpy version (vectorized)
import numpy as np
import time

nx, ny = 10000, 10000
pnp = np.zeros((nx, ny))

if __name__ == "__main__":
    # cache warm up: touch the array once so lazily-allocated zero pages are
    # materialized before the timed run (mirrors the Taichi warm-up call).
    _ = pnp + 1
    start = time.time()
    # Single vectorized expression: the whole loop runs in native code.
    pnp = pnp + 0.1
    print(time.time() - start)
# numba version
import numpy as np
# numba: a JIT compiler similar in spirit to Taichi, but it targets ordinary
# data types such as numpy arrays and even built-in Python lists.
import numba
import time

nx, ny = 10000, 10000
pnp = np.zeros((nx, ny))


# BUG FIX: the eager signature must declare a 2-D array, numba.f8[:, :].
# The original numba.f8[:] declared a 1-D array, so numba's type checker
# rejects the 2-D indexing pnp[i, j] with a TypingError at call time.
@numba.jit(numba.void(numba.f8[:, :]))
def main(pnp):
    """Add 0.1 to every element of the 2-D float64 array *pnp* in place."""
    for i in range(nx):
        for j in range(ny):
            pnp[i, j] = pnp[i, j] + 0.1


if __name__ == "__main__":
    main(np.zeros((nx, ny)))  # warm-up call: triggers JIT compilation
    start = time.time()
    main(pnp)
    print(time.time() - start)
# Parallel sum of 1..100 split across four threads (pseudocode).
# Each thread accumulates a disjoint quarter of the range into the shared
# variable `sum`; the expected total is 1 + 2 + ... + 100 = 5050.
# NOTE(review): no lock or atomic operation is shown around `sum += i`.
# As written, the four threads race on the shared accumulator, so the
# printed result may be less than 5050 unless the runtime guarantees that
# `+=` is atomic — confirm the intended semantics.
sum = 0
start thread 1:
for i from 1 to 25:
sum += i
start thread 2:
for i from 26 to 50:
sum += i
start thread 3:
for i from 51 to 75:
sum += i
start thread 4:
for i from 76 to 100:
sum += i
# Join: block until all four threads have finished before reading `sum`.
wait threads done
print sum